| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07445200003530772, "p50": 0.07589100005134242, "p90": 0.07600200001434132, "mean": 0.0754678000248532, "iqr": 0.0014600000213249587, "raw_times": [0.0764520000302582, 0.07600200001434132, 0.07589100005134242, 0.07454199999301636, 0.07445200003530772], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08018199991965957, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898300006876525, "p50": 0.09174199999506527, "p90": 0.09300300007453188, "mean": 0.09168480000880663, "iqr": 0.0013200001376389991, "raw_times": [0.09168299993689288, 0.09300300007453188, 0.09174199999506527, 0.09301299996877788, 0.08898300006876525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09485199984737847, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08705300001565774, "p50": 0.09316300020145718, "p90": 0.10223200001746591, "mean": 0.09793460003493237, "iqr": 0.013889999991079094, "raw_times": [0.10223200001746591, 0.09316300020145718, 0.11888299991369422, 0.08834200002638681, 0.08705300001565774], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.09152200004791666, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0873520000368444, "p50": 0.08926300006351084, "p90": 0.08946200000536919, "mean": 0.08885220004231087, "iqr": 0.0013999999737279722, "raw_times": [0.0873520000368444, 0.09012200007418869, 0.08926300006351084, 0.08806200003164122, 0.08946200000536919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09113200007959676, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08857299985720601, "p50": 0.09020299989970226, "p90": 0.09035299990500789, "mean": 0.0900987999557401, "iqr": 0.00085999977272877, "raw_times": [0.08949300013227912, 0.09020299989970226, 0.09035299990500789, 0.08857299985720601, 0.09187199998450524], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09262200001103338, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08818200012683519, "p50": 0.08903200000531797, "p90": 0.08924200005822058, "mean": 0.08891200000107347, "iqr": 0.0008400002116104588, "raw_times": [0.08903200000531797, 0.08840199984661012, 0.08818200012683519, 0.08924200005822058, 0.08970199996838346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09153199994216266, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08838300004754274, "p50": 0.08999199985737505, "p90": 0.0905619999684859, "mean": 0.09254639999198844, "iqr": 0.0011899999208253575, "raw_times": [0.0905619999684859, 0.10442300003887794, 0.08937200004766055, 0.08999199985737505, 0.08838300004754274], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10543200005486142, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08926200007408625, "p50": 0.08963200002654048, "p90": 0.09017300021696428, "mean": 0.08983220009213255, "iqr": 0.0005410001904238015, "raw_times": [0.08963200002654048, 0.09017300021696428, 0.08926200007408625, 0.08963200002654048, 0.09046200011653127], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09046300010595587, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08713199986232212, "p50": 0.08950200003710052, "p90": 0.08994299992082233, "mean": 0.08932219993766921, "iqr": 0.0012210000477352878, "raw_times": [0.08872199987308704, 0.08950200003710052, 0.08994299992082233, 0.09131199999501405, 0.08713199986232212], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09138199993685703, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08889199989425833, "p50": 0.09063199991032889, "p90": 0.09075200000552286, "mean": 0.09513419990980765, "iqr": 0.0011500001164677087, "raw_times": [0.08889199989425833, 0.08960199988905515, 0.09075200000552286, 0.09063199991032889, 0.11579299984987301], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194200015372189, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985299996311369, "p50": 0.09080199993150018, "p90": 0.09128199985752872, "mean": 0.09099019994209812, "iqr": 0.0010899998414970469, "raw_times": [0.08985299996311369, 0.09019200001603167, 0.09128199985752872, 0.09282199994231632, 0.09080199993150018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09297199994762195, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2603559998988203, "p50": 0.2619460001369589, "p90": 0.2620170000682265, "mean": 0.26208240001324157, "iqr": 0.00017100001059588976, "raw_times": [0.2619460001369589, 0.2603559998988203, 0.2618460000576306, 0.26424699990457157, 0.2620170000682265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26098600005752814, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0871929998993437, "p50": 0.08902200011107197, "p90": 0.08904199989956396, "mean": 0.08843839996188763, "iqr": 0.0015389998679893324, "raw_times": [0.08750300003157463, 0.08904199989956396, 0.08943199986788386, 0.08902200011107197, 0.0871929998993437], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09225299982063007, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0884020000739838, "p50": 0.08922200004235492, "p90": 0.08970199996838346, "mean": 0.08935000005294569, "iqr": 0.0005199999577598646, "raw_times": [0.08922200004235492, 0.0884020000739838, 0.09024200016938266, 0.08970199996838346, 0.0891820000106236], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09462200000598386, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08808200004750688, "p50": 0.08919200013224327, "p90": 0.09035200014295697, "mean": 0.0894580000476708, "iqr": 0.0017100001059588976, "raw_times": [0.08808200004750688, 0.08919200013224327, 0.09035200014295697, 0.09102199987864878, 0.08864200003699807], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0948819999848638, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08770199997343298, "p50": 0.08942199997363787, "p90": 0.08942299996306247, "mean": 0.08932019995882001, "iqr": 0.0003010000000358559, "raw_times": [0.08912199996302661, 0.09093199992094014, 0.08770199997343298, 0.08942299996306247, 0.08942199997363787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09424199993190996, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08902200011107197, "p50": 0.09058199998435157, "p90": 0.09125299993684166, "mean": 0.09215640002366854, "iqr": 0.0012609998520929366, "raw_times": [0.09993300000132876, 0.09058199998435157, 0.09125299993684166, 0.08902200011107197, 0.08999200008474872], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09323300014330016, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08776200002102996, "p50": 0.08892300002116826, "p90": 0.08966199993665214, "mean": 0.0888985999608849, "iqr": 0.001638999947317643, "raw_times": [0.09012299983623961, 0.08892300002116826, 0.08966199993665214, 0.08776200002102996, 0.0880229999893345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09301299996877788, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0880819998201332, "p50": 0.08909200005291495, "p90": 0.08928200008995191, "mean": 0.08905000004233443, "iqr": 0.00023999996301427018, "raw_times": [0.0880819998201332, 0.08904200012693764, 0.08975200012173445, 0.08909200005291495, 0.08928200008995191], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09304200011683861, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08700200010025583, "p50": 0.08849200003169244, "p90": 0.0890119999894523, "mean": 0.08845600000313425, "iqr": 0.0007099999947968172, "raw_times": [0.0890119999894523, 0.08830199999465549, 0.08947199989961518, 0.08700200010025583, 0.08849200003169244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09235300012733205, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904200012693764, "p50": 0.0900719999208377, "p90": 0.09035200014295697, "mean": 0.09022600006574066, "iqr": 0.0009600000794307562, "raw_times": [0.09035200014295697, 0.08904200012693764, 0.0922720000744448, 0.08939200006352621, 0.0900719999208377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09144199998445401, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08612200008428772, "p50": 0.08916199999475793, "p90": 0.08966199993665214, "mean": 0.08842420002110885, "iqr": 0.002328999926248798, "raw_times": [0.08612200008428772, 0.0898420000794431, 0.08916199999475793, 0.08733300001040334, 0.08966199993665214], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09376299999530602, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2547460001096624, "p50": 0.25804599999901257, "p90": 0.2586460000202351, "mean": 0.25757600001270475, "iqr": 0.0013200001376389991, "raw_times": [0.2547460001096624, 0.2591160000520176, 0.25804599999901257, 0.2586460000202351, 0.2573259998825961], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25434600001972285, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8454199999050616, "p50": 0.8495600000060222, "p90": 0.8538209999642277, "mean": 0.8503745999860257, "iqr": 0.0067099999796482734, "raw_times": [0.8559610000702378, 0.8538209999642277, 0.8471109999845794, 0.8495600000060222, 0.8454199999050616], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8642910001981363, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null} | |