diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Mon Nov 10 21:57:39 2025 +Fri Dec 19 18:55:27 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | +| NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 26C P0 88W / 350W | 0MiB / 46068MiB | 22% Default | +| N/A 28C P0 85W / 350W | 0MiB / 46068MiB | 34% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3938,7 +3938,7 @@ Cell: nv | 0.22s ▼ output ▶ uv-logs | -Cell: benchmark | 4.74s +Cell: benchmark | 34.01s | Raw @@ -4009,23 +4009,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 403.678us 1730.44% 403.678us 403.678us 1 - hf_kernels_rotary 9.63% 231.023us 99.37% 2.384ms 2.384ms 0.000us 0.00% 24.608us 24.608us 1 - _rotary_dba7d1e::apply_rotary 2.18% 52.340us 4.07% 97.602us 16.267us 16.224us 69.55% 16.224us 2.704us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.224us 69.55% 16.224us 2.704us 6 - aten::clone 1.53% 36.662us 83.59% 2.005ms 334.171us 0.000us 0.00% 8.384us 1.397us 6 - aten::copy_ 1.80% 43.260us 79.70% 1.912ms 318.600us 7.104us 30.45% 8.384us 1.397us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.104us 30.45% 7.104us 1.184us 6 - Activity Buffer Request 74.82% 1.795ms 74.82% 1.795ms 1.795ms 1.280us 5.49% 1.280us 1.280us 1 - aten::empty_strided 2.37% 56.761us 2.37% 56.761us 9.460us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.07% 73.591us 3.07% 73.591us 12.265us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.65% 39.481us 2.08% 49.901us 4.158us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.43% 10.420us 0.43% 10.420us 0.868us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.89% 45.262us 1.89% 45.262us 7.544us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.63% 15.070us 0.63% 15.070us 15.070us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 443.904us 1908.20% 443.904us 443.904us 1 + hf_kernels_rotary 5.70% 250.202us 82.78% 3.636ms 3.636ms 0.000us 0.00% 24.543us 24.543us 1 + _rotary_dba7d1e::apply_rotary 1.26% 55.422us 2.46% 108.103us 18.017us 16.158us 69.46% 16.158us 2.693us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.158us 69.46% 16.158us 2.693us 6 + aten::clone 1.27% 55.831us 73.22% 3.217ms 536.122us 0.000us 0.00% 8.385us 1.398us 6 + aten::copy_ 1.34% 58.721us 52.83% 2.321ms 386.805us 7.105us 30.54% 8.385us 1.398us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.105us 30.54% 7.105us 1.184us 6 + Activity Buffer Request 49.67% 2.182ms 49.67% 2.182ms 2.182ms 1.280us 5.50% 1.280us 1.280us 1 + aten::empty_strided 19.12% 840.074us 19.12% 840.074us 140.012us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 1.82% 80.012us 1.82% 80.012us 13.335us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.10% 48.201us 1.40% 61.410us 5.118us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.30% 13.209us 0.30% 13.209us 1.101us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.20% 52.681us 1.20% 52.681us 8.780us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 17.22% 756.622us 17.22% 756.622us 756.622us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.399ms -Self CUDA time total: 23.328us +Self CPU time total: 4.393ms +Self CUDA time total: 23.263us @@ -4035,23 +4035,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.494us 1388.06% 334.494us 334.494us 1 - hf_kernels_rotary 8.19% 181.152us 99.73% 2.206ms 2.206ms 0.000us 0.00% 25.410us 25.410us 1 - _rotary_dba7d1e::apply_rotary 1.81% 39.991us 3.60% 79.751us 13.292us 16.193us 67.20% 16.193us 2.699us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.193us 67.20% 16.193us 2.699us 6 - aten::clone 1.33% 29.430us 86.17% 1.906ms 317.722us 0.000us 0.00% 9.217us 1.536us 6 - aten::copy_ 1.70% 37.720us 83.32% 1.843ms 307.237us 7.905us 32.80% 9.217us 1.536us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 32.80% 7.905us 1.317us 6 - Activity Buffer Request 79.13% 1.751ms 79.13% 1.751ms 1.751ms 1.312us 5.44% 1.312us 1.312us 1 - aten::empty_strided 1.51% 33.481us 1.51% 33.481us 5.580us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.49% 55.161us 2.49% 55.161us 9.194us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.38% 30.530us 1.77% 39.222us 3.268us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.39% 8.692us 0.39% 8.692us 0.724us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.80% 39.760us 1.80% 39.760us 6.627us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.27% 5.870us 0.27% 5.870us 5.870us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 319.740us 1332.31% 319.740us 319.740us 1 + hf_kernels_rotary 7.66% 161.733us 99.74% 2.106ms 2.106ms 0.000us 0.00% 25.311us 25.311us 1 + _rotary_dba7d1e::apply_rotary 1.93% 40.672us 3.87% 81.612us 13.602us 16.127us 67.20% 16.127us 2.688us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.127us 67.20% 16.127us 2.688us 6 + aten::clone 0.96% 20.317us 86.36% 1.823ms 303.883us 0.000us 0.00% 9.184us 1.531us 6 + aten::copy_ 1.65% 34.881us 83.94% 1.772ms 295.377us 7.872us 32.80% 9.184us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 32.80% 7.872us 1.312us 6 + Activity Buffer Request 79.76% 1.684ms 79.76% 1.684ms 1.684ms 1.312us 5.47% 1.312us 1.312us 1 + aten::empty_strided 1.46% 30.722us 1.46% 30.722us 5.120us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.53% 53.351us 2.53% 53.351us 8.892us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.45% 30.570us 1.85% 39.091us 3.258us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.40% 8.521us 0.40% 8.521us 0.710us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.94% 40.940us 1.94% 40.940us 6.823us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.581us 0.26% 5.581us 5.581us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.212ms -Self CUDA time total: 24.098us +Self CPU time total: 2.111ms +Self CUDA time total: 23.999us @@ -4061,23 +4061,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 333.020us 1374.81% 333.020us 333.020us 1 - hf_kernels_rotary 8.22% 183.662us 99.77% 2.229ms 2.229ms 0.000us 0.00% 25.535us 25.535us 1 - _rotary_dba7d1e::apply_rotary 1.78% 39.771us 3.54% 79.142us 13.190us 16.479us 68.03% 16.479us 2.747us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.479us 68.03% 16.479us 2.747us 6 - aten::clone 1.23% 27.502us 86.14% 1.925ms 320.808us 0.000us 0.00% 9.056us 1.509us 6 - aten::copy_ 1.51% 33.780us 83.43% 1.864ms 310.723us 7.744us 31.97% 9.056us 1.509us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 31.97% 7.744us 1.291us 6 - Activity Buffer Request 79.60% 1.779ms 79.60% 1.779ms 1.779ms 1.312us 5.42% 1.312us 1.312us 1 - aten::empty_strided 1.48% 33.009us 1.48% 33.009us 5.501us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.32% 51.921us 2.32% 51.921us 8.654us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.44% 32.260us 1.87% 41.742us 3.478us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.42% 9.482us 0.42% 9.482us 0.790us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.76% 39.371us 1.76% 39.371us 6.562us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 5.150us 0.23% 5.150us 5.150us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 318.846us 1316.19% 318.846us 318.846us 1 + hf_kernels_rotary 7.42% 158.343us 99.75% 2.128ms 2.128ms 0.000us 0.00% 25.505us 25.505us 1 + _rotary_dba7d1e::apply_rotary 1.98% 42.310us 3.93% 83.780us 13.963us 16.544us 68.29% 16.544us 2.757us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.544us 68.29% 16.544us 2.757us 6 + aten::clone 0.91% 19.412us 86.52% 1.845ms 307.560us 0.000us 0.00% 8.961us 1.493us 6 + aten::copy_ 1.63% 34.769us 84.21% 1.796ms 299.358us 7.681us 31.71% 8.961us 1.493us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.681us 31.71% 7.681us 1.280us 6 + Activity Buffer Request 80.16% 1.710ms 80.16% 1.710ms 1.710ms 1.280us 5.28% 1.280us 1.280us 1 + aten::empty_strided 1.40% 29.800us 1.40% 29.800us 4.967us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.42% 51.552us 2.42% 51.552us 8.592us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.48% 31.501us 1.89% 40.211us 3.351us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.41% 8.710us 0.41% 8.710us 0.726us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.94% 41.470us 1.94% 41.470us 6.912us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.25% 5.300us 0.25% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.235ms -Self CUDA time total: 24.223us +Self CPU time total: 2.133ms +Self CUDA time total: 24.225us @@ -4087,23 +4087,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.396us 1170.66% 330.396us 330.396us 1 - hf_kernels_rotary 19.88% 180.354us 99.43% 901.975us 901.975us 0.000us 0.00% 29.983us 29.983us 1 - _rotary_dba7d1e::apply_rotary 4.33% 39.273us 8.60% 78.013us 13.002us 17.759us 62.92% 17.759us 2.960us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.759us 62.92% 17.759us 2.960us 6 - aten::clone 2.43% 22.040us 66.64% 604.579us 100.763us 0.000us 0.00% 12.224us 2.037us 6 - aten::copy_ 3.81% 34.600us 60.79% 551.459us 91.910us 10.464us 37.08% 12.224us 2.037us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 37.08% 10.464us 1.744us 6 - Activity Buffer Request 27.63% 250.684us 27.63% 250.684us 250.684us 1.760us 6.24% 1.760us 1.760us 1 - aten::empty_strided 3.43% 31.080us 3.43% 31.080us 5.180us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 29.34% 266.175us 29.34% 266.175us 44.362us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.36% 30.489us 4.30% 39.029us 3.252us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.94% 8.540us 0.94% 8.540us 0.712us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.27% 38.740us 4.27% 38.740us 6.457us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 5.209us 0.57% 5.209us 5.209us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 321.313us 1144.93% 321.313us 321.313us 1 + hf_kernels_rotary 6.51% 155.543us 99.79% 2.386ms 2.386ms 0.000us 0.00% 29.792us 29.792us 1 + _rotary_dba7d1e::apply_rotary 1.75% 41.730us 3.43% 82.081us 13.680us 17.695us 63.05% 17.695us 2.949us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 63.05% 17.695us 2.949us 6 + aten::clone 0.86% 20.620us 88.15% 2.107ms 351.189us 0.000us 0.00% 12.097us 2.016us 6 + aten::copy_ 1.42% 33.971us 86.01% 2.056ms 342.697us 10.369us 36.95% 12.097us 2.016us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.369us 36.95% 10.369us 1.728us 6 + Activity Buffer Request 74.19% 1.773ms 74.19% 1.773ms 1.773ms 1.728us 6.16% 1.728us 1.728us 1 + aten::empty_strided 1.27% 30.330us 1.27% 30.330us 5.055us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.41% 248.804us 10.41% 248.804us 41.467us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.33% 31.699us 1.70% 40.751us 3.396us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.38% 9.052us 0.38% 9.052us 0.754us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.69% 40.351us 1.69% 40.351us 6.725us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.21% 5.000us 0.21% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 907.184us -Self CUDA time total: 28.223us +Self CPU time total: 2.391ms +Self CUDA time total: 28.064us @@ -4113,23 +4113,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 333.950us 1371.35% 333.950us 333.950us 1 - hf_kernels_rotary 7.53% 182.915us 99.79% 2.425ms 2.425ms 0.000us 0.00% 25.664us 25.664us 1 - _rotary_dba7d1e::apply_rotary 1.65% 40.000us 3.26% 79.130us 13.188us 16.545us 67.94% 16.545us 2.758us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.545us 67.94% 16.545us 2.758us 6 - aten::clone 1.26% 30.642us 87.34% 2.122ms 353.721us 0.000us 0.00% 9.119us 1.520us 6 - aten::copy_ 1.47% 35.799us 84.75% 2.059ms 343.229us 7.807us 32.06% 9.119us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 32.06% 7.807us 1.301us 6 - Activity Buffer Request 73.06% 1.775ms 73.06% 1.775ms 1.775ms 1.312us 5.39% 1.312us 1.312us 1 - aten::empty_strided 1.33% 32.310us 1.33% 32.310us 5.385us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.22% 248.434us 10.22% 248.434us 41.406us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.31% 31.720us 1.66% 40.370us 3.364us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.36% 8.650us 0.36% 8.650us 0.721us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.61% 39.130us 1.61% 39.130us 6.522us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.21% 5.100us 0.21% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 325.341us 1337.81% 325.341us 325.341us 1 + hf_kernels_rotary 6.82% 156.303us 99.76% 2.286ms 2.286ms 0.000us 0.00% 25.599us 25.599us 1 + _rotary_dba7d1e::apply_rotary 1.86% 42.600us 3.70% 84.711us 14.118us 16.512us 67.90% 16.512us 2.752us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.512us 67.90% 16.512us 2.752us 6 + aten::clone 0.89% 20.410us 87.49% 2.005ms 334.180us 0.000us 0.00% 9.087us 1.514us 6 + aten::copy_ 1.54% 35.239us 85.28% 1.954ms 325.734us 7.807us 32.10% 9.087us 1.514us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 32.10% 7.807us 1.301us 6 + Activity Buffer Request 73.15% 1.676ms 73.15% 1.676ms 1.676ms 1.280us 5.26% 1.280us 1.280us 1 + aten::empty_strided 1.32% 30.271us 1.32% 30.271us 5.045us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.60% 242.835us 10.60% 242.835us 40.472us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.38% 31.561us 1.75% 40.080us 3.340us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.519us 0.37% 8.519us 0.710us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.84% 42.111us 1.84% 42.111us 7.018us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 5.490us 0.24% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.430ms -Self CUDA time total: 24.352us +Self CPU time total: 2.292ms +Self CUDA time total: 24.319us @@ -4139,23 +4139,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.717us 1169.19% 330.717us 330.717us 1 - hf_kernels_rotary 7.60% 182.573us 99.80% 2.396ms 2.396ms 0.000us 0.00% 30.046us 30.046us 1 - _rotary_dba7d1e::apply_rotary 1.66% 39.960us 3.28% 78.811us 13.135us 17.758us 62.78% 17.758us 2.960us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.758us 62.78% 17.758us 2.960us 6 - aten::clone 1.18% 28.252us 87.25% 2.095ms 349.108us 0.000us 0.00% 12.288us 2.048us 6 - aten::copy_ 1.56% 37.480us 84.78% 2.035ms 339.209us 10.528us 37.22% 12.288us 2.048us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 37.22% 10.528us 1.755us 6 - Activity Buffer Request 73.02% 1.753ms 73.02% 1.753ms 1.753ms 1.760us 6.22% 1.760us 1.760us 1 - aten::empty_strided 1.30% 31.140us 1.30% 31.140us 5.190us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.19% 244.675us 10.19% 244.675us 40.779us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.30% 31.158us 1.66% 39.899us 3.325us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.36% 8.741us 0.36% 8.741us 0.728us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.62% 38.851us 1.62% 38.851us 6.475us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.20% 4.770us 0.20% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.675us 1148.27% 323.675us 323.675us 1 + hf_kernels_rotary 6.64% 156.971us 99.78% 2.359ms 2.359ms 0.000us 0.00% 29.948us 29.948us 1 + _rotary_dba7d1e::apply_rotary 1.69% 39.892us 3.46% 81.891us 13.648us 17.695us 62.77% 17.695us 2.949us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 62.77% 17.695us 2.949us 6 + aten::clone 0.91% 21.443us 87.99% 2.080ms 346.632us 0.000us 0.00% 12.253us 2.042us 6 + aten::copy_ 1.41% 33.321us 85.79% 2.028ms 337.974us 10.493us 37.23% 12.253us 2.042us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.493us 37.23% 10.493us 1.749us 6 + Activity Buffer Request 74.45% 1.760ms 74.45% 1.760ms 1.760ms 1.760us 6.24% 1.760us 1.760us 1 + aten::empty_strided 1.29% 30.510us 1.29% 30.510us 5.085us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.92% 234.593us 9.92% 234.593us 39.099us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.33% 31.410us 1.69% 40.001us 3.333us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.36% 8.591us 0.36% 8.591us 0.716us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.78% 41.999us 1.78% 41.999us 7.000us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.111us 0.22% 5.111us 5.111us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.401ms -Self CUDA time total: 28.286us +Self CPU time total: 2.364ms +Self CUDA time total: 28.188us @@ -4165,23 +4165,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 331.263us 811.96% 331.263us 331.263us 1 - hf_kernels_rotary 7.62% 179.163us 99.79% 2.346ms 2.346ms 0.000us 0.00% 43.646us 43.646us 1 - _rotary_dba7d1e::apply_rotary 1.67% 39.309us 3.29% 77.411us 12.902us 23.680us 58.04% 23.680us 3.947us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.680us 58.04% 23.680us 3.947us 6 - aten::clone 1.17% 27.469us 87.14% 2.049ms 341.486us 0.000us 0.00% 19.966us 3.328us 6 - aten::copy_ 1.49% 35.141us 84.62% 1.990ms 331.589us 17.118us 41.96% 19.966us 3.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.118us 41.96% 17.118us 2.853us 6 - Activity Buffer Request 73.01% 1.717ms 73.01% 1.717ms 1.717ms 2.848us 6.98% 2.848us 2.848us 1 - aten::empty_strided 1.36% 31.912us 1.36% 31.912us 5.319us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.11% 237.764us 10.11% 237.764us 39.627us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.35% 31.810us 1.74% 40.800us 3.400us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.38% 8.990us 0.38% 8.990us 0.749us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.62% 38.102us 1.62% 38.102us 6.350us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.21% 4.871us 0.21% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 324.125us 795.09% 324.125us 324.125us 1 + hf_kernels_rotary 6.45% 153.782us 99.79% 2.380ms 2.380ms 0.000us 0.00% 43.614us 43.614us 1 + _rotary_dba7d1e::apply_rotary 1.81% 43.080us 3.50% 83.561us 13.927us 23.646us 58.00% 23.646us 3.941us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.646us 58.00% 23.646us 3.941us 6 + aten::clone 0.86% 20.470us 88.19% 2.103ms 350.526us 0.000us 0.00% 19.968us 3.328us 6 + aten::copy_ 1.45% 34.599us 85.96% 2.050ms 341.697us 17.120us 42.00% 19.968us 3.328us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.120us 42.00% 17.120us 2.853us 6 + Activity Buffer Request 74.86% 1.785ms 74.86% 1.785ms 1.785ms 2.848us 6.99% 2.848us 2.848us 1 + aten::empty_strided 1.36% 32.503us 1.36% 32.503us 5.417us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.65% 230.194us 9.65% 230.194us 38.366us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.31% 31.171us 1.65% 39.351us 3.279us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.34% 8.180us 0.34% 8.180us 0.682us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.70% 40.481us 1.70% 40.481us 6.747us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.21% 5.060us 0.21% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.351ms -Self CUDA time total: 40.798us +Self CPU time total: 2.385ms +Self CUDA time total: 40.766us @@ -4191,23 +4191,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 336.387us 451.94% 336.387us 336.387us 1 - hf_kernels_rotary 7.84% 184.420us 99.78% 2.346ms 2.346ms 0.000us 0.00% 82.976us 82.976us 1 - aten::clone 1.21% 28.560us 86.97% 2.045ms 340.779us 0.000us 0.00% 43.553us 7.259us 6 - aten::copy_ 1.54% 36.092us 84.34% 1.983ms 330.495us 35.009us 47.03% 43.553us 7.259us 6 - _rotary_dba7d1e::apply_rotary 1.67% 39.331us 3.28% 77.091us 12.849us 39.423us 52.97% 39.423us 6.571us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.423us 52.97% 39.423us 6.571us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 35.009us 47.03% 35.009us 5.835us 6 - Activity Buffer Request 73.02% 1.717ms 73.02% 1.717ms 1.717ms 8.544us 11.48% 8.544us 8.544us 1 - aten::empty_strided 1.41% 33.141us 1.41% 33.141us 5.523us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.79% 230.064us 9.79% 230.064us 38.344us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.34% 31.492us 1.69% 39.832us 3.319us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.35% 8.340us 0.35% 8.340us 0.695us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.61% 37.760us 1.61% 37.760us 6.293us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 5.070us 0.22% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 326.719us 435.58% 326.719us 326.719us 1 + hf_kernels_rotary 6.86% 156.773us 99.78% 2.282ms 2.282ms 0.000us 0.00% 83.583us 83.583us 1 + aten::clone 0.90% 20.639us 87.67% 2.005ms 334.152us 0.000us 0.00% 44.415us 7.402us 6 + aten::copy_ 1.46% 33.361us 85.45% 1.954ms 325.685us 35.839us 47.78% 44.415us 7.402us 6 + _rotary_dba7d1e::apply_rotary 1.78% 40.770us 3.52% 80.530us 13.422us 39.168us 52.22% 39.168us 6.528us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.168us 52.22% 39.168us 6.528us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 35.839us 47.78% 35.839us 5.973us 6 + Activity Buffer Request 74.00% 1.692ms 74.00% 1.692ms 1.692ms 8.576us 11.43% 8.576us 8.576us 1 + aten::empty_strided 1.32% 30.160us 1.32% 30.160us 5.027us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.99% 228.424us 9.99% 228.424us 38.071us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.36% 31.101us 1.73% 39.632us 3.303us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.531us 0.37% 8.531us 0.711us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.74% 39.760us 1.74% 39.760us 6.627us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.030us 0.22% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.351ms -Self CUDA time total: 74.432us +Self CPU time total: 2.287ms +Self CUDA time total: 75.007us @@ -4217,23 +4217,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.720us 824.27% 334.720us 334.720us 1 - hf_kernels_rotary 7.69% 178.052us 99.76% 2.310ms 2.310ms 0.000us 0.00% 43.488us 43.488us 1 - _rotary_dba7d1e::apply_rotary 1.77% 40.921us 3.42% 79.272us 13.212us 23.680us 58.31% 23.680us 3.947us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.680us 58.31% 23.680us 3.947us 6 - aten::clone 1.23% 28.463us 86.92% 2.013ms 335.521us 0.000us 0.00% 19.808us 3.301us 6 - aten::copy_ 1.52% 35.247us 84.34% 1.953ms 325.533us 16.928us 41.69% 19.808us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 41.69% 16.928us 2.821us 6 - Activity Buffer Request 73.01% 1.691ms 73.01% 1.691ms 1.691ms 2.880us 7.09% 2.880us 2.880us 1 - aten::empty_strided 1.36% 31.460us 1.36% 31.460us 5.243us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.81% 227.126us 9.81% 227.126us 37.854us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.37% 31.801us 1.73% 40.020us 3.335us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.35% 8.219us 0.35% 8.219us 0.685us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.66% 38.351us 1.66% 38.351us 6.392us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 5.500us 0.24% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 313.534us 775.13% 313.534us 313.534us 1 + hf_kernels_rotary 17.26% 147.172us 99.40% 847.604us 847.604us 0.000us 0.00% 43.297us 43.297us 1 + _rotary_dba7d1e::apply_rotary 4.54% 38.680us 9.23% 78.702us 13.117us 23.425us 57.91% 23.425us 3.904us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.425us 57.91% 23.425us 3.904us 6 + aten::clone 2.34% 19.949us 68.60% 584.900us 97.483us 0.000us 0.00% 19.872us 3.312us 6 + aten::copy_ 3.97% 33.871us 62.79% 535.360us 89.227us 17.024us 42.09% 19.872us 3.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 42.09% 17.024us 2.837us 6 + Activity Buffer Request 32.38% 276.114us 32.38% 276.114us 276.114us 2.848us 7.04% 2.848us 2.848us 1 + aten::empty_strided 3.47% 29.591us 3.47% 29.591us 4.932us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.43% 225.375us 26.43% 225.375us 37.563us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.45% 29.390us 4.32% 36.830us 3.069us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.87% 7.440us 0.87% 7.440us 0.620us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.69% 40.022us 4.69% 40.022us 6.670us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.60% 5.080us 0.60% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.316ms -Self CUDA time total: 40.608us +Self CPU time total: 852.684us +Self CUDA time total: 40.449us @@ -4243,23 +4243,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 343.357us 451.99% 343.357us 343.357us 1 - hf_kernels_rotary 7.23% 182.803us 99.81% 2.522ms 2.522ms 0.000us 0.00% 85.341us 85.341us 1 - aten::clone 1.16% 29.441us 87.88% 2.221ms 370.131us 0.000us 0.00% 46.013us 7.669us 6 - aten::copy_ 1.42% 35.932us 85.39% 2.158ms 359.654us 36.637us 48.23% 46.013us 7.669us 6 - _rotary_dba7d1e::apply_rotary 1.58% 39.950us 3.09% 78.111us 13.018us 39.328us 51.77% 39.328us 6.555us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.328us 51.77% 39.328us 6.555us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 36.637us 48.23% 36.637us 6.106us 6 - Activity Buffer Request 75.16% 1.899ms 75.16% 1.899ms 1.899ms 9.376us 12.34% 9.376us 9.376us 1 - aten::empty_strided 1.32% 33.420us 1.32% 33.420us 5.570us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.81% 222.633us 8.81% 222.633us 37.105us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.25% 31.613us 1.61% 40.701us 3.392us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.36% 9.088us 0.36% 9.088us 0.757us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.51% 38.161us 1.51% 38.161us 6.360us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.19% 4.790us 0.19% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 317.820us 416.43% 317.820us 317.820us 1 + hf_kernels_rotary 17.56% 148.193us 99.41% 838.994us 838.994us 0.000us 0.00% 85.633us 85.633us 1 + aten::clone 2.33% 19.639us 67.72% 571.550us 95.258us 0.000us 0.00% 46.146us 7.691us 6 + aten::copy_ 4.11% 34.662us 61.88% 522.280us 87.047us 36.833us 48.26% 46.146us 7.691us 6 + _rotary_dba7d1e::apply_rotary 4.71% 39.770us 9.52% 80.371us 13.395us 39.487us 51.74% 39.487us 6.581us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.487us 51.74% 39.487us 6.581us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 36.833us 48.26% 36.833us 6.139us 6 + Activity Buffer Request 31.55% 266.304us 31.55% 266.304us 266.304us 9.313us 12.20% 9.313us 9.313us 1 + aten::empty_strided 3.51% 29.631us 3.51% 29.631us 4.938us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.22% 221.314us 26.22% 221.314us 36.886us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.67% 30.998us 4.61% 38.880us 3.240us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.93% 7.882us 0.93% 7.882us 0.657us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.81% 40.601us 4.81% 40.601us 6.767us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.59% 5.020us 0.59% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.527ms -Self CUDA time total: 75.965us +Self CPU time total: 844.014us +Self CUDA time total: 76.320us @@ -4269,23 +4269,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.487us 241.29% 335.487us 335.487us 1 - hf_kernels_rotary 7.48% 174.562us 99.79% 2.329ms 2.329ms 0.000us 0.00% 162.718us 162.718us 1 - aten::clone 1.24% 29.010us 87.24% 2.036ms 339.299us 0.000us 0.00% 102.494us 17.082us 6 - aten::copy_ 1.51% 35.312us 84.60% 1.974ms 329.037us 78.815us 56.69% 102.494us 17.082us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.815us 56.69% 78.815us 13.136us 6 - _rotary_dba7d1e::apply_rotary 1.71% 39.800us 3.37% 78.741us 13.124us 60.224us 43.31% 60.224us 10.037us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 60.224us 43.31% 60.224us 10.037us 6 - Activity Buffer Request 73.92% 1.725ms 73.92% 1.725ms 1.725ms 23.679us 17.03% 23.679us 23.679us 1 - aten::empty_strided 1.40% 32.561us 1.40% 32.561us 5.427us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.17% 213.963us 9.17% 213.963us 35.660us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.33% 31.050us 1.69% 39.471us 3.289us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.36% 8.421us 0.36% 8.421us 0.702us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.67% 38.941us 1.67% 38.941us 6.490us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.21% 4.971us 0.21% 4.971us 4.971us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 319.423us 229.00% 319.423us 319.423us 1 + hf_kernels_rotary 18.34% 146.203us 99.38% 792.173us 792.173us 0.000us 0.00% 163.169us 163.169us 1 + aten::clone 2.42% 19.310us 66.03% 526.340us 87.723us 0.000us 0.00% 102.753us 17.126us 6 + aten::copy_ 4.16% 33.151us 59.92% 477.629us 79.605us 79.073us 56.69% 102.753us 17.126us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 79.073us 56.69% 79.073us 13.179us 6 + _rotary_dba7d1e::apply_rotary 5.00% 39.859us 10.11% 80.600us 13.433us 60.416us 43.31% 60.416us 10.069us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 60.416us 43.31% 60.416us 10.069us 6 + Activity Buffer Request 28.37% 226.154us 28.37% 226.154us 226.154us 23.680us 16.98% 23.680us 23.680us 1 + aten::empty_strided 3.69% 29.401us 3.69% 29.401us 4.900us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.39% 218.324us 27.39% 218.324us 36.387us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.87% 30.870us 4.90% 39.030us 3.252us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.02% 8.160us 1.02% 8.160us 0.680us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.11% 40.741us 5.11% 40.741us 6.790us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 4.920us 0.62% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.334ms -Self CUDA time total: 139.039us +Self CPU time total: 797.093us +Self CUDA time total: 139.489us @@ -4295,23 +4295,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 13.11% 152.482us 70.07% 814.833us 814.833us 0.000us 0.00% 767.862us 767.862us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 709.398us 101.13% 709.398us 709.398us 1 - aten::clone 1.92% 22.371us 46.79% 544.150us 90.692us 0.000us 0.00% 567.671us 94.612us 6 - aten::copy_ 3.06% 35.584us 42.24% 491.229us 81.872us 501.304us 71.46% 567.671us 94.612us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 501.304us 71.46% 501.304us 83.551us 6 - _rotary_dba7d1e::apply_rotary 3.52% 40.960us 6.87% 79.901us 13.317us 200.191us 28.54% 200.191us 33.365us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 200.191us 28.54% 200.191us 33.365us 6 - Activity Buffer Request 20.99% 244.144us 20.99% 244.144us 244.144us 66.367us 9.46% 66.367us 66.367us 1 - aten::empty_strided 2.63% 30.550us 2.63% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.19% 211.501us 18.19% 211.501us 35.250us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.57% 29.881us 3.29% 38.300us 3.192us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.72% 8.419us 0.72% 8.419us 0.702us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.35% 38.941us 3.35% 38.941us 6.490us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 29.93% 348.096us 29.93% 348.096us 348.096us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 12.16% 146.839us 70.39% 850.223us 850.223us 0.000us 0.00% 769.020us 769.020us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 710.970us 101.11% 710.970us 710.970us 1 + aten::clone 3.16% 38.139us 48.21% 582.349us 97.058us 0.000us 0.00% 567.581us 94.597us 6 + aten::copy_ 2.68% 32.339us 42.57% 514.168us 85.695us 501.725us 71.35% 567.581us 94.597us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 501.725us 71.35% 501.725us 83.621us 6 + _rotary_dba7d1e::apply_rotary 3.42% 41.301us 6.76% 81.602us 13.600us 201.439us 28.65% 201.439us 33.573us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 201.439us 28.65% 201.439us 33.573us 6 + Activity Buffer Request 21.65% 261.455us 21.65% 261.455us 261.455us 65.856us 9.37% 65.856us 65.856us 1 + aten::empty_strided 2.49% 30.042us 2.49% 30.042us 5.007us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.25% 220.374us 18.25% 220.374us 36.729us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.57% 30.999us 3.26% 39.433us 3.286us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.70% 8.434us 0.70% 8.434us 0.703us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.34% 40.301us 3.34% 40.301us 6.717us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 29.61% 357.615us 29.61% 357.615us 357.615us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.163ms -Self CUDA time total: 701.495us +Self CPU time total: 1.208ms +Self CUDA time total: 703.164us @@ -4321,23 +4321,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 326.016us 1225.99% 326.016us 326.016us 1 - hf_kernels_rotary 18.50% 152.323us 99.40% 818.663us 818.663us 0.000us 0.00% 27.904us 27.904us 1 - _rotary_dba7d1e::apply_rotary 4.86% 40.039us 9.57% 78.850us 13.142us 18.752us 70.52% 18.752us 3.125us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.752us 70.52% 18.752us 3.125us 6 - aten::clone 2.56% 21.061us 66.62% 548.640us 91.440us 0.000us 0.00% 9.152us 1.525us 6 - aten::copy_ 4.19% 34.519us 60.27% 496.387us 82.731us 7.840us 29.48% 9.152us 1.525us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 29.48% 7.840us 1.307us 6 - Activity Buffer Request 29.97% 246.784us 29.97% 246.784us 246.784us 1.312us 4.93% 1.312us 1.312us 1 - aten::empty_strided 3.79% 31.192us 3.79% 31.192us 5.199us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.12% 215.084us 26.12% 215.084us 35.847us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.71% 30.531us 4.72% 38.850us 3.237us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.01% 8.319us 1.01% 8.319us 0.693us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.71% 38.811us 4.71% 38.811us 6.469us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.60% 4.910us 0.60% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 317.658us 1193.13% 317.658us 317.658us 1 + hf_kernels_rotary 17.50% 146.563us 99.40% 832.443us 832.443us 0.000us 0.00% 27.968us 27.968us 1 + _rotary_dba7d1e::apply_rotary 4.88% 40.860us 9.91% 82.980us 13.830us 18.751us 70.43% 18.751us 3.125us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.751us 70.43% 18.751us 3.125us 6 + aten::clone 2.35% 19.662us 67.40% 564.410us 94.068us 0.000us 0.00% 9.217us 1.536us 6 + aten::copy_ 4.08% 34.181us 61.36% 513.878us 85.646us 7.873us 29.57% 9.217us 1.536us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.873us 29.57% 7.873us 1.312us 6 + Activity Buffer Request 31.10% 260.474us 31.10% 260.474us 260.474us 1.344us 5.05% 1.344us 1.344us 1 + aten::empty_strided 3.69% 30.870us 3.69% 30.870us 5.145us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.18% 219.223us 26.18% 219.223us 36.537us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.65% 30.540us 4.60% 38.490us 3.208us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.95% 7.950us 0.95% 7.950us 0.663us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.03% 42.120us 5.03% 42.120us 7.020us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.60% 5.020us 0.60% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 823.573us -Self CUDA time total: 26.592us +Self CPU time total: 837.463us +Self CUDA time total: 26.624us @@ -4347,23 +4347,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.263us 1209.82% 323.263us 323.263us 1 - hf_kernels_rotary 17.52% 147.623us 99.42% 837.623us 837.623us 0.000us 0.00% 28.032us 28.032us 1 - _rotary_dba7d1e::apply_rotary 4.62% 38.930us 9.25% 77.941us 12.990us 18.944us 70.90% 18.944us 3.157us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.944us 70.90% 18.944us 3.157us 6 - aten::clone 2.83% 23.880us 68.02% 573.009us 95.502us 0.000us 0.00% 9.088us 1.515us 6 - aten::copy_ 4.05% 34.160us 61.53% 518.397us 86.400us 7.776us 29.10% 9.088us 1.515us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 29.10% 7.776us 1.296us 6 - Activity Buffer Request 32.41% 273.024us 32.41% 273.024us 273.024us 1.312us 4.91% 1.312us 1.312us 1 - aten::empty_strided 3.65% 30.732us 3.65% 30.732us 5.122us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.07% 211.213us 25.07% 211.213us 35.202us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.65% 30.720us 4.64% 39.050us 3.254us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.99% 8.330us 0.99% 8.330us 0.694us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.63% 39.011us 4.63% 39.011us 6.502us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.850us 0.58% 4.850us 4.850us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 308.930us 1157.56% 308.930us 308.930us 1 + hf_kernels_rotary 17.42% 145.421us 99.38% 829.543us 829.543us 0.000us 0.00% 28.000us 28.000us 1 + _rotary_dba7d1e::apply_rotary 5.67% 47.303us 10.60% 88.453us 14.742us 18.944us 70.98% 18.944us 3.157us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.944us 70.98% 18.944us 3.157us 6 + aten::clone 2.28% 19.028us 66.78% 557.449us 92.908us 0.000us 0.00% 9.056us 1.509us 6 + aten::copy_ 3.82% 31.872us 61.07% 509.750us 84.958us 7.744us 29.02% 9.056us 1.509us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 29.02% 7.744us 1.291us 6 + Activity Buffer Request 31.42% 262.245us 31.42% 262.245us 262.245us 1.312us 4.92% 1.312us 1.312us 1 + aten::empty_strided 3.43% 28.671us 3.43% 28.671us 4.778us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.83% 215.633us 25.83% 215.633us 35.939us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.59% 29.990us 4.58% 38.220us 3.185us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.230us 0.99% 8.230us 0.686us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.93% 41.150us 4.93% 41.150us 6.858us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.211us 0.62% 5.211us 5.211us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 842.473us -Self CUDA time total: 26.720us +Self CPU time total: 834.754us +Self CUDA time total: 26.688us @@ -4373,23 +4373,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 317.947us 1037.18% 317.947us 317.947us 1 - hf_kernels_rotary 18.00% 147.321us 99.35% 812.963us 812.963us 0.000us 0.00% 32.383us 32.383us 1 - _rotary_dba7d1e::apply_rotary 4.88% 39.901us 9.44% 77.251us 12.875us 20.255us 66.07% 20.255us 3.376us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.255us 66.07% 20.255us 3.376us 6 - aten::clone 2.41% 19.693us 67.19% 549.781us 91.630us 0.000us 0.00% 12.128us 2.021us 6 - aten::copy_ 4.28% 35.023us 61.13% 500.160us 83.360us 10.400us 33.93% 12.128us 2.021us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 33.93% 10.400us 1.733us 6 - Activity Buffer Request 31.00% 253.664us 31.00% 253.664us 253.664us 1.728us 5.64% 1.728us 1.728us 1 - aten::empty_strided 3.66% 29.928us 3.66% 29.928us 4.988us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.84% 211.473us 25.84% 211.473us 35.245us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.72% 30.411us 4.72% 38.610us 3.218us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.00% 8.199us 1.00% 8.199us 0.683us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.56% 37.350us 4.56% 37.350us 6.225us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.65% 5.289us 0.65% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 326.626us 1069.89% 326.626us 326.626us 1 + hf_kernels_rotary 6.56% 153.551us 99.79% 2.337ms 2.337ms 0.000us 0.00% 32.289us 32.289us 1 + _rotary_dba7d1e::apply_rotary 1.75% 41.061us 3.50% 81.981us 13.664us 20.161us 66.04% 20.161us 3.360us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.161us 66.04% 20.161us 3.360us 6 + aten::clone 0.92% 21.442us 88.00% 2.061ms 343.521us 0.000us 0.00% 12.128us 2.021us 6 + aten::copy_ 1.46% 34.178us 85.80% 2.010ms 334.939us 10.368us 33.96% 12.128us 2.021us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.368us 33.96% 10.368us 1.728us 6 + Activity Buffer Request 75.10% 1.759ms 75.10% 1.759ms 1.759ms 1.760us 5.77% 1.760us 1.760us 1 + aten::empty_strided 1.28% 30.051us 1.28% 30.051us 5.009us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.25% 216.555us 9.25% 216.555us 36.092us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.37% 32.001us 1.73% 40.601us 3.383us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.600us 0.37% 8.600us 0.717us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.75% 40.920us 1.75% 40.920us 6.820us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.21% 4.951us 0.21% 4.951us 4.951us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 818.252us -Self CUDA time total: 30.655us +Self CPU time total: 2.342ms +Self CUDA time total: 30.529us @@ -4399,22 +4399,22 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 331.769us 777.76% 331.769us 331.769us 1 - hf_kernels_rotary 19.70% 168.549us 99.44% 850.864us 850.864us 0.000us 0.00% 45.537us 45.537us 1 - _rotary_dba7d1e::apply_rotary 4.73% 40.431us 9.19% 78.662us 13.110us 25.697us 60.24% 25.697us 4.283us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.697us 60.24% 25.697us 4.283us 6 - aten::clone 2.97% 25.433us 65.78% 562.881us 93.814us 0.000us 0.00% 19.840us 3.307us 6 - aten::copy_ 4.23% 36.170us 59.14% 506.068us 84.345us 16.960us 39.76% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 39.76% 16.960us 2.827us 6 - Activity Buffer Request 30.43% 260.334us 30.43% 260.334us 260.334us 2.880us 6.75% 2.880us 2.880us 1 - aten::empty_strided 3.67% 31.380us 3.67% 31.380us 5.230us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.49% 209.564us 24.49% 209.564us 34.927us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.75% 32.092us 4.77% 40.772us 3.398us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.01% 8.680us 1.01% 8.680us 0.723us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.47% 38.231us 4.47% 38.231us 6.372us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.56% 4.789us 0.56% 4.789us 4.789us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 323.551us 758.49% 323.551us 323.551us 1 + hf_kernels_rotary 6.57% 149.229us 99.78% 2.268ms 2.268ms 0.000us 0.00% 45.505us 45.505us 1 + _rotary_dba7d1e::apply_rotary 1.78% 40.461us 3.63% 82.552us 13.759us 25.601us 60.02% 25.601us 4.267us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.601us 60.02% 25.601us 4.267us 6 + aten::clone 0.92% 20.920us 87.81% 1.996ms 332.671us 0.000us 0.00% 19.904us 3.317us 6 + aten::copy_ 1.45% 33.001us 85.55% 1.945ms 324.092us 17.056us 39.98% 19.904us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 39.98% 17.056us 2.843us 6 + Activity Buffer Request 74.47% 1.693ms 74.47% 1.693ms 1.693ms 2.848us 6.68% 2.848us 2.848us 1 + aten::empty_strided 1.34% 30.551us 1.34% 30.551us 5.092us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.62% 218.764us 9.62% 218.764us 36.461us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.40% 31.822us 1.77% 40.192us 3.349us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.370us 0.37% 8.370us 0.697us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.85% 42.091us 1.85% 42.091us 7.015us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 5.030us 0.22% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 855.653us +Self CPU time total: 2.273ms Self CUDA time total: 42.657us @@ -4425,23 +4425,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 324.568us 1058.74% 324.568us 324.568us 1 - hf_kernels_rotary 19.85% 169.202us 99.36% 847.094us 847.094us 0.000us 0.00% 32.384us 32.384us 1 - _rotary_dba7d1e::apply_rotary 4.69% 39.959us 9.27% 78.991us 13.165us 20.352us 66.39% 20.352us 3.392us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.352us 66.39% 20.352us 3.392us 6 - aten::clone 2.92% 24.890us 65.73% 560.410us 93.402us 0.000us 0.00% 12.032us 2.005us 6 - aten::copy_ 4.20% 35.769us 59.19% 504.659us 84.110us 10.304us 33.61% 12.032us 2.005us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.304us 33.61% 10.304us 1.717us 6 - Activity Buffer Request 30.61% 260.975us 30.61% 260.975us 260.975us 1.728us 5.64% 1.728us 1.728us 1 - aten::empty_strided 3.62% 30.861us 3.62% 30.861us 5.143us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.39% 207.915us 24.39% 207.915us 34.652us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.54% 30.221us 4.51% 38.491us 3.208us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.97% 8.270us 0.97% 8.270us 0.689us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.58% 39.032us 4.58% 39.032us 6.505us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.64% 5.460us 0.64% 5.460us 5.460us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 313.528us 1024.90% 313.528us 313.528us 1 + hf_kernels_rotary 17.21% 144.859us 99.41% 836.674us 836.674us 0.000us 0.00% 32.319us 32.319us 1 + _rotary_dba7d1e::apply_rotary 4.77% 40.179us 9.63% 81.081us 13.513us 20.224us 66.11% 20.224us 3.371us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.224us 66.11% 20.224us 3.371us 6 + aten::clone 2.34% 19.722us 68.09% 573.072us 95.512us 0.000us 0.00% 12.095us 2.016us 6 + aten::copy_ 3.82% 32.139us 62.12% 522.779us 87.130us 10.367us 33.89% 12.095us 2.016us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 33.89% 10.367us 1.728us 6 + Activity Buffer Request 32.59% 274.305us 32.59% 274.305us 274.305us 1.728us 5.65% 1.728us 1.728us 1 + aten::empty_strided 3.63% 30.571us 3.63% 30.571us 5.095us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.70% 216.335us 25.70% 216.335us 36.056us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.51% 29.563us 4.47% 37.662us 3.139us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.96% 8.099us 0.96% 8.099us 0.675us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.86% 40.902us 4.86% 40.902us 6.817us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.59% 4.950us 0.59% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 852.554us -Self CUDA time total: 30.656us +Self CPU time total: 841.624us +Self CUDA time total: 30.591us @@ -4451,23 +4451,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 328.702us 766.04% 328.702us 328.702us 1 - hf_kernels_rotary 18.09% 152.853us 99.33% 839.363us 839.363us 0.000us 0.00% 45.788us 45.788us 1 - _rotary_dba7d1e::apply_rotary 4.68% 39.541us 9.21% 77.782us 12.964us 25.887us 60.33% 25.887us 4.314us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.887us 60.33% 25.887us 4.314us 6 - aten::clone 2.66% 22.468us 67.35% 569.108us 94.851us 0.000us 0.00% 19.901us 3.317us 6 - aten::copy_ 4.16% 35.173us 60.88% 514.450us 85.742us 17.022us 39.67% 19.901us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.022us 39.67% 17.022us 2.837us 6 - Activity Buffer Request 32.07% 270.965us 32.07% 270.965us 270.965us 2.879us 6.71% 2.879us 2.879us 1 - aten::empty_strided 3.81% 32.190us 3.81% 32.190us 5.365us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.65% 208.312us 24.65% 208.312us 34.719us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.71% 31.390us 4.69% 39.620us 3.302us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.97% 8.230us 0.97% 8.230us 0.686us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.53% 38.241us 4.53% 38.241us 6.374us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.67% 5.631us 0.67% 5.631us 5.631us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 315.863us 736.07% 315.863us 315.863us 1 + hf_kernels_rotary 17.12% 141.480us 99.37% 821.243us 821.243us 0.000us 0.00% 45.760us 45.760us 1 + _rotary_dba7d1e::apply_rotary 4.88% 40.322us 9.94% 82.183us 13.697us 25.824us 60.18% 25.824us 4.304us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.824us 60.18% 25.824us 4.304us 6 + aten::clone 2.37% 19.588us 67.68% 559.279us 93.213us 0.000us 0.00% 19.936us 3.323us 6 + aten::copy_ 4.15% 34.293us 61.65% 509.500us 84.917us 17.088us 39.82% 19.936us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 39.82% 17.088us 2.848us 6 + Activity Buffer Request 31.27% 258.454us 31.27% 258.454us 258.454us 2.848us 6.64% 2.848us 2.848us 1 + aten::empty_strided 3.65% 30.191us 3.65% 30.191us 5.032us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.23% 216.753us 26.23% 216.753us 36.126us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.65% 30.130us 4.63% 38.301us 3.192us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.171us 0.99% 8.171us 0.681us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.07% 41.861us 5.07% 41.861us 6.977us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.63% 5.170us 0.63% 5.170us 5.170us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 844.994us -Self CUDA time total: 42.909us +Self CPU time total: 826.413us +Self CUDA time total: 42.912us @@ -4477,23 +4477,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 337.246us 364.66% 337.246us 337.246us 1 - hf_kernels_rotary 7.43% 178.431us 99.78% 2.398ms 2.398ms 0.000us 0.00% 107.425us 107.425us 1 - aten::clone 1.14% 27.439us 87.31% 2.098ms 349.642us 0.000us 0.00% 65.823us 10.970us 6 - aten::copy_ 1.39% 33.333us 84.85% 2.039ms 339.779us 50.880us 55.02% 65.823us 10.970us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 50.880us 55.02% 50.880us 8.480us 6 - _rotary_dba7d1e::apply_rotary 1.70% 40.740us 3.29% 79.070us 13.178us 41.602us 44.98% 41.602us 6.934us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.602us 44.98% 41.602us 6.934us 6 - Activity Buffer Request 74.72% 1.795ms 74.72% 1.795ms 1.795ms 14.943us 16.16% 14.943us 14.943us 1 - aten::empty_strided 1.32% 31.741us 1.32% 31.741us 5.290us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.74% 209.903us 8.74% 209.903us 34.984us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.35% 32.344us 1.76% 42.183us 3.515us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.41% 9.839us 0.41% 9.839us 0.820us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.60% 38.330us 1.60% 38.330us 6.388us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 5.280us 0.22% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 311.420us 333.98% 311.420us 311.420us 1 + hf_kernels_rotary 17.30% 140.163us 99.43% 805.353us 805.353us 0.000us 0.00% 108.190us 108.190us 1 + aten::clone 2.32% 18.760us 67.64% 547.879us 91.313us 0.000us 0.00% 66.399us 11.066us 6 + aten::copy_ 4.25% 34.400us 61.74% 500.059us 83.343us 51.455us 55.18% 66.399us 11.066us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.455us 55.18% 51.455us 8.576us 6 + _rotary_dba7d1e::apply_rotary 4.76% 38.581us 9.84% 79.710us 13.285us 41.791us 44.82% 41.791us 6.965us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.791us 44.82% 41.791us 6.965us 6 + Activity Buffer Request 31.09% 251.814us 31.09% 251.814us 251.814us 14.944us 16.03% 14.944us 14.944us 1 + aten::empty_strided 3.59% 29.060us 3.59% 29.060us 4.843us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.40% 213.845us 26.40% 213.845us 35.641us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.70% 29.950us 4.64% 37.601us 3.133us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.94% 7.651us 0.94% 7.651us 0.638us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.08% 41.129us 5.08% 41.129us 6.855us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.57% 4.640us 0.57% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.403ms -Self CUDA time total: 92.482us +Self CPU time total: 809.993us +Self CUDA time total: 93.246us @@ -4503,23 +4503,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 331.357us 227.98% 331.357us 331.357us 1 - hf_kernels_rotary 19.22% 153.403us 99.38% 793.253us 793.253us 0.000us 0.00% 169.054us 169.054us 1 - aten::clone 2.47% 19.681us 65.33% 521.479us 86.913us 0.000us 0.00% 105.151us 17.525us 6 - aten::copy_ 4.41% 35.219us 59.11% 471.788us 78.631us 81.439us 56.03% 105.151us 17.525us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.439us 56.03% 81.439us 13.573us 6 - _rotary_dba7d1e::apply_rotary 5.09% 40.640us 9.93% 79.270us 13.212us 63.903us 43.97% 63.903us 10.650us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.903us 43.97% 63.903us 10.650us 6 - Activity Buffer Request 29.11% 232.364us 29.11% 232.364us 232.364us 23.712us 16.31% 23.712us 23.712us 1 - aten::empty_strided 3.76% 30.010us 3.76% 30.010us 5.002us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.58% 204.205us 25.58% 204.205us 34.034us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.78% 30.171us 4.90% 39.101us 3.258us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.12% 8.930us 1.12% 8.930us 0.744us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.84% 38.630us 4.84% 38.630us 6.438us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 4.940us 0.62% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 314.038us 216.45% 314.038us 314.038us 1 + hf_kernels_rotary 17.88% 139.471us 99.38% 775.432us 775.432us 0.000us 0.00% 168.795us 168.795us 1 + aten::clone 2.37% 18.470us 66.41% 518.157us 86.360us 0.000us 0.00% 104.989us 17.498us 6 + aten::copy_ 4.24% 33.100us 60.11% 469.037us 78.173us 81.277us 56.02% 104.989us 17.498us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.277us 56.02% 81.277us 13.546us 6 + _rotary_dba7d1e::apply_rotary 5.14% 40.111us 10.39% 81.071us 13.512us 63.806us 43.98% 63.806us 10.634us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.806us 43.98% 63.806us 10.634us 6 + Activity Buffer Request 28.68% 223.784us 28.68% 223.784us 223.784us 23.712us 16.34% 23.712us 23.712us 1 + aten::empty_strided 3.93% 30.650us 3.93% 30.650us 5.108us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.19% 212.153us 27.19% 212.153us 35.359us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.73% 29.133us 4.71% 36.733us 3.061us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.97% 7.600us 0.97% 7.600us 0.633us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.25% 40.960us 5.25% 40.960us 6.827us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 4.811us 0.62% 4.811us 4.811us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 798.193us -Self CUDA time total: 145.342us +Self CPU time total: 780.243us +Self CUDA time total: 145.083us @@ -4529,23 +4529,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 327.384us 410.23% 327.384us 327.384us 1 - hf_kernels_rotary 18.75% 148.421us 99.39% 786.852us 786.852us 0.000us 0.00% 89.981us 89.981us 1 - aten::clone 2.67% 21.153us 65.81% 521.010us 86.835us 0.000us 0.00% 47.613us 7.935us 6 - aten::copy_ 4.62% 36.560us 59.19% 468.587us 78.098us 37.437us 46.91% 47.613us 7.935us 6 - _rotary_dba7d1e::apply_rotary 5.10% 40.369us 9.95% 78.790us 13.132us 42.368us 53.09% 42.368us 7.061us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 42.368us 53.09% 42.368us 7.061us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.437us 46.91% 37.437us 6.240us 6 - Activity Buffer Request 28.86% 228.474us 28.86% 228.474us 228.474us 10.176us 12.75% 10.176us 10.176us 1 - aten::empty_strided 3.95% 31.270us 3.95% 31.270us 5.212us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.71% 203.553us 25.71% 203.553us 33.925us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.86% 30.542us 4.88% 38.631us 3.219us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.02% 8.089us 1.02% 8.089us 0.674us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.85% 38.421us 4.85% 38.421us 6.403us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.61% 4.869us 0.61% 4.869us 4.869us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 319.458us 399.31% 319.458us 319.458us 1 + hf_kernels_rotary 17.47% 143.453us 99.38% 816.213us 816.213us 0.000us 0.00% 90.147us 90.147us 1 + aten::clone 2.34% 19.239us 67.36% 553.208us 92.201us 0.000us 0.00% 47.811us 7.969us 6 + aten::copy_ 3.98% 32.663us 61.19% 502.549us 83.758us 37.666us 47.08% 47.811us 7.969us 6 + _rotary_dba7d1e::apply_rotary 4.95% 40.641us 9.85% 80.901us 13.484us 42.336us 52.92% 42.336us 7.056us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 42.336us 52.92% 42.336us 7.056us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.666us 47.08% 37.666us 6.278us 6 + Activity Buffer Request 31.07% 255.204us 31.07% 255.204us 255.204us 10.145us 12.68% 10.145us 10.145us 1 + aten::empty_strided 3.83% 31.420us 3.83% 31.420us 5.237us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 26.14% 214.682us 26.14% 214.682us 35.780us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.75% 30.830us 4.71% 38.651us 3.221us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.95% 7.821us 0.95% 7.821us 0.652us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.90% 40.260us 4.90% 40.260us 6.710us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.090us 0.62% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 791.721us -Self CUDA time total: 79.805us +Self CPU time total: 821.303us +Self CUDA time total: 80.002us @@ -4555,23 +4555,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.133us 229.04% 334.133us 334.133us 1 - hf_kernels_rotary 18.91% 152.747us 99.33% 802.303us 802.303us 0.000us 0.00% 169.593us 169.593us 1 - aten::clone 2.63% 21.282us 65.81% 531.500us 88.583us 0.000us 0.00% 105.244us 17.541us 6 - aten::copy_ 4.22% 34.070us 59.15% 477.709us 79.618us 81.533us 55.89% 105.244us 17.541us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.533us 55.89% 81.533us 13.589us 6 - _rotary_dba7d1e::apply_rotary 4.95% 39.971us 9.71% 78.412us 13.069us 64.349us 44.11% 64.349us 10.725us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.349us 44.11% 64.349us 10.725us 6 - Activity Buffer Request 29.92% 241.694us 29.92% 241.694us 241.694us 23.711us 16.25% 23.711us 23.711us 1 - aten::empty_strided 4.02% 32.509us 4.02% 32.509us 5.418us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.00% 201.945us 25.00% 201.945us 33.657us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.87% 31.225us 4.91% 39.644us 3.304us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.04% 8.419us 1.04% 8.419us 0.702us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.76% 38.441us 4.76% 38.441us 6.407us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.67% 5.380us 0.67% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 325.402us 221.65% 325.402us 325.402us 1 + hf_kernels_rotary 17.64% 145.390us 99.37% 819.093us 819.093us 0.000us 0.00% 170.492us 170.492us 1 + aten::clone 2.33% 19.171us 66.92% 551.610us 91.935us 0.000us 0.00% 105.757us 17.626us 6 + aten::copy_ 4.29% 35.392us 61.01% 502.899us 83.816us 82.077us 55.91% 105.757us 17.626us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 82.077us 55.91% 82.077us 13.679us 6 + _rotary_dba7d1e::apply_rotary 4.98% 41.061us 10.11% 83.363us 13.894us 64.735us 44.09% 64.735us 10.789us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.735us 44.09% 64.735us 10.789us 6 + Activity Buffer Request 31.05% 255.945us 31.05% 255.945us 255.945us 23.680us 16.13% 23.680us 23.680us 1 + aten::empty_strided 3.58% 29.540us 3.58% 29.540us 4.923us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.67% 211.562us 25.67% 211.562us 35.260us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.74% 30.830us 4.70% 38.730us 3.228us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.96% 7.900us 0.96% 7.900us 0.658us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.13% 42.302us 5.13% 42.302us 7.050us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.63% 5.170us 0.63% 5.170us 5.170us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 807.683us -Self CUDA time total: 145.882us +Self CPU time total: 824.263us +Self CUDA time total: 146.812us @@ -4581,23 +4581,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 13.54% 152.254us 71.57% 804.992us 804.992us 0.000us 0.00% 741.111us 741.111us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 682.359us 101.20% 682.359us 682.359us 1 - aten::clone 1.94% 21.788us 47.45% 533.747us 88.958us 0.000us 0.00% 557.274us 92.879us 6 - aten::copy_ 3.08% 34.611us 42.75% 480.788us 80.131us 490.426us 72.74% 557.274us 92.879us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 490.426us 72.74% 490.426us 81.738us 6 - _rotary_dba7d1e::apply_rotary 3.61% 40.571us 7.01% 78.811us 13.135us 183.837us 27.26% 183.837us 30.639us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 183.837us 27.26% 183.837us 30.639us 6 - Activity Buffer Request 21.83% 245.524us 21.83% 245.524us 245.524us 66.848us 9.91% 66.848us 66.848us 1 - aten::empty_strided 2.77% 31.171us 2.77% 31.171us 5.195us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.84% 200.653us 17.84% 200.653us 33.442us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.81% 31.570us 3.57% 40.180us 3.348us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.77% 8.610us 0.77% 8.610us 0.718us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.40% 38.240us 3.40% 38.240us 6.373us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 28.43% 319.765us 28.43% 319.765us 319.765us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 12.27% 141.491us 70.82% 816.704us 816.704us 0.000us 0.00% 741.813us 741.813us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 683.383us 101.18% 683.383us 683.383us 1 + aten::clone 1.70% 19.590us 48.29% 556.839us 92.806us 0.000us 0.00% 558.329us 93.055us 6 + aten::copy_ 3.05% 35.121us 43.99% 507.258us 84.543us 491.962us 72.84% 558.329us 93.055us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 491.962us 72.84% 491.962us 81.994us 6 + _rotary_dba7d1e::apply_rotary 3.41% 39.351us 6.95% 80.182us 13.364us 183.484us 27.16% 183.484us 30.581us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 183.484us 27.16% 183.484us 30.581us 6 + Activity Buffer Request 22.79% 262.814us 22.79% 262.814us 262.814us 66.367us 9.83% 66.367us 66.367us 1 + aten::empty_strided 2.60% 29.991us 2.60% 29.991us 4.998us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.15% 209.323us 18.15% 209.323us 34.887us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.59% 29.922us 3.31% 38.192us 3.183us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.72% 8.270us 0.72% 8.270us 0.689us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.54% 40.831us 3.54% 40.831us 6.805us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 29.18% 336.495us 29.18% 336.495us 336.495us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.125ms -Self CUDA time total: 674.263us +Self CPU time total: 1.153ms +Self CUDA time total: 675.446us @@ -4607,30 +4607,30 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 5.26% 152.407us 28.24% 818.853us 818.853us 0.000us 0.00% 2.611ms 2.611ms 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.442ms 100.34% 2.442ms 2.442ms 1 - aten::clone 0.72% 20.941us 18.92% 548.700us 91.450us 0.000us 0.00% 1.390ms 231.619us 6 - aten::copy_ 1.19% 34.511us 17.07% 495.108us 82.518us 1.212ms 49.82% 1.390ms 231.619us 6 - _rotary_dba7d1e::apply_rotary 1.41% 40.761us 2.75% 79.892us 13.315us 1.221ms 50.18% 1.221ms 203.523us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.221ms 50.18% 1.221ms 203.523us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.212ms 49.82% 1.212ms 202.067us 6 - Activity Buffer Request 8.94% 259.144us 8.94% 259.144us 259.144us 177.311us 7.29% 177.311us 177.311us 1 - aten::empty_strided 1.13% 32.651us 1.13% 32.651us 5.442us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.95% 201.453us 6.95% 201.453us 33.575us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.03% 29.842us 1.31% 37.854us 3.154us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.28% 8.012us 0.28% 8.012us 0.668us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.35% 39.131us 1.35% 39.131us 6.522us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 71.76% 2.081ms 71.76% 2.081ms 2.081ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 3.40% 150.944us 53.25% 2.364ms 2.364ms 0.000us 0.00% 2.619ms 2.619ms 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.449ms 100.31% 2.449ms 2.449ms 1 + aten::clone 0.54% 23.821us 47.05% 2.089ms 348.124us 0.000us 0.00% 1.393ms 232.202us 6 + aten::copy_ 0.77% 34.330us 45.79% 2.033ms 338.776us 1.215ms 49.78% 1.393ms 232.202us 6 + _rotary_dba7d1e::apply_rotary 0.95% 42.082us 1.92% 85.331us 14.222us 1.226ms 50.22% 1.226ms 204.346us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.226ms 50.22% 1.226ms 204.346us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.215ms 49.78% 1.215ms 202.575us 6 + Activity Buffer Request 40.08% 1.779ms 40.08% 1.779ms 1.779ms 177.759us 7.28% 177.759us 177.759us 1 + aten::empty_strided 0.73% 32.270us 0.73% 32.270us 5.378us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 4.93% 218.903us 4.93% 218.903us 36.484us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 0.70% 31.129us 0.88% 39.000us 3.250us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.18% 7.871us 0.18% 7.871us 0.656us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 0.97% 43.249us 0.97% 43.249us 7.208us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 46.75% 2.075ms 46.75% 2.075ms 2.075ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.900ms -Self CUDA time total: 2.434ms +Self CPU time total: 4.439ms +Self CUDA time total: 2.442ms impl wl p50(ms) ok hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True -hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True +hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True @@ -4655,12 +4655,62 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True▶ UV Install LogsFetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.23it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.22it/s]+Fetching 5 files: 40%|████ | 2/5 [00:00<00:00, 19.18it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. + +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.18it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.78it/s]Artifacts:
rotary.jsonl