File size: 28,549 Bytes
3b25788 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0716920001195831, "p50": 0.07250199996633455, "p90": 0.07283199988705746, "mean": 0.07240760000968294, "iqr": 0.0009109999155043624, "raw_times": [0.07250199996633455, 0.07283199988705746, 0.0716920001195831, 0.0719209999715531, 0.07309100010388647], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08089199991445639, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08928200008995191, "p50": 0.08997100007945846, "p90": 0.0909519999368058, "mean": 0.09690580000096816, "iqr": 0.001269999984288006, "raw_times": [0.08928200008995191, 0.08997100007945846, 0.0896819999525178, 0.0909519999368058, 0.12464199994610681], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10248300009152445, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08953199994721217, "p90": 0.09005199990497204, "mean": 0.08934599995882309, "iqr": 0.0015899997833912494, "raw_times": [0.08800199998404423, 0.0906819998363062, 0.08953199994721217, 0.08846200012158079, 0.09005199990497204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08988199988380075, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08559200000490819, "p50": 0.08698200008439017, "p90": 0.08820199991532718, "mean": 0.08761399999457353, "iqr": 0.0018399998680251883, "raw_times": [0.08820199991532718, 0.08698200008439017, 0.09093199992094014, 0.08559200000490819, 0.08636200004730199], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09196199994221388, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0869620000685245, "p50": 0.08914199997889227, "p90": 0.08982199983620376, "mean": 0.08910199994716095, "iqr": 0.0019899998733308166, "raw_times": [0.08783199996287294, 0.08914199997889227, 0.08982199983620376, 0.0869620000685245, 0.09175199988931126], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08990199989966641, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08547199990971421, "p50": 0.08752200005801569, "p90": 0.08800199998404423, "mean": 0.08713399993212079, "iqr": 0.002420000100755715, "raw_times": [0.08558199988328852, 0.08909199982554128, 0.08752200005801569, 0.08547199990971421, 0.08800199998404423], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911110000743065, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08664199981467391, "p50": 0.0881319999734842, "p90": 0.08973199987849512, "mean": 0.08822799991321517, "iqr": 0.0029299999368959107, "raw_times": [0.08664199981467391, 0.08983199995782343, 0.0881319999734842, 0.0868019999415992, 0.08973199987849512], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08937200004766055, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08743200010030705, "p50": 0.08830199999465549, "p90": 0.08903200000531797, "mean": 0.08846400005495525, "iqr": 0.0009099999260797631, "raw_times": [0.08830199999465549, 0.08903200000531797, 0.08943200009525754, 0.0881220000792382, 0.08743200010030705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105200001613412, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08645200000501063, "p50": 0.08729199998924742, "p90": 0.08832200001052115, "mean": 0.08765380002841994, "iqr": 0.0011610000001383014, "raw_times": [0.08645200000501063, 0.08904200012693764, 0.08832200001052115, 0.08716100001038285, 0.08729199998924742], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923200016397459, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08737200005271006, "p50": 0.08886199998414668, "p90": 0.08923200016397459, "mean": 0.08870620004017837, "iqr": 0.0006990001111262245, "raw_times": [0.08737200005271006, 0.08953199994721217, 0.08886199998414668, 0.08853300005284837, 0.08923200016397459], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981199994195777, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08698100009496557, "p50": 0.0875120001637697, "p90": 0.08985099998426449, "mean": 0.08861960000103863, "iqr": 0.0027790001695393585, "raw_times": [0.0875120001637697, 0.08985099998426449, 0.08698100009496557, 0.09168199994746828, 0.08707199981472513], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09252200015907874, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2610050000839692, "p50": 0.261626000110482, "p90": 0.26182599981439125, "mean": 0.261735599997337, "iqr": 0.0002109998149535386, "raw_times": [0.2616149999994377, 0.2610050000839692, 0.261626000110482, 0.26182599981439125, 0.2626059999784047], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26071599995702854, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08662100003675732, "p50": 0.08793199981482758, "p90": 0.08872200010046072, "mean": 0.08787560000200756, "iqr": 0.0014409999948838959, "raw_times": [0.08662100003675732, 0.08872200010046072, 0.08882199995241535, 0.08793199981482758, 0.08728100010557682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0905719998627319, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08488200001011137, "p50": 0.08702200011612149, "p90": 0.08860200000526675, "mean": 0.08870799997566792, "iqr": 0.002460000132487039, "raw_times": [0.08488200001011137, 0.08702200011612149, 0.08860200000526675, 0.0861419998727797, 0.09689199987406028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09050199992088892, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08632200001557067, "p50": 0.08908199993129529, "p90": 0.08911199984140694, "mean": 0.08852599994497723, "iqr": 0.0001399998836859595, "raw_times": [0.08632200001557067, 0.08914199997889227, 0.08911199984140694, 0.08908199993129529, 0.08897199995772098], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09170199996333395, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08825200006867817, "p50": 0.08947200012698886, "p90": 0.08949199991548085, "mean": 0.08924200005822058, "iqr": 0.00026999987312592566, "raw_times": [0.08825200006867817, 0.08977200013760012, 0.08949199991548085, 0.08947200012698886, 0.08922200004235492], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09157099998446938, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08679200004735321, "p50": 0.08709200005796447, "p90": 0.08838100006869354, "mean": 0.08772180003688845, "iqr": 0.0012890000107290689, "raw_times": [0.08709200005796447, 0.08679200004735321, 0.08925199995246658, 0.08709200005796447, 0.08838100006869354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09222199992109381, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08846200012158079, "p50": 0.08976200001598045, "p90": 0.09003199988910637, "mean": 0.0897020000138582, "iqr": 0.00085999977272877, "raw_times": [0.08846200012158079, 0.09003199988910637, 0.08976200001598045, 0.0891720001163776, 0.09108199992624577], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0899420001587714, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08725199995751609, "p50": 0.08807200015326089, "p90": 0.08955199996307783, "mean": 0.08866800003488606, "iqr": 0.0017499999103165464, "raw_times": [0.08780200005276129, 0.08807200015326089, 0.08955199996307783, 0.08725199995751609, 0.09066200004781422], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10307200000170269, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0861920000261307, "p50": 0.08893199992598966, "p90": 0.08899199997358664, "mean": 0.08807199997136195, "iqr": 0.0017600000319362152, "raw_times": [0.0861920000261307, 0.08723199994165043, 0.08893199992598966, 0.0890119999894523, 0.08899199997358664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09088199999496283, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08883200007403502, "p90": 0.08927299995775684, "mean": 0.08933019998949021, "iqr": 0.0010710000424296595, "raw_times": [0.08800199998404423, 0.08883200007403502, 0.09234200001628778, 0.08820199991532718, 0.08927299995775684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09017200000016601, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575199990445981, "p50": 0.08742199997868738, "p90": 0.08754200007388135, "mean": 0.08730620002097567, "iqr": 0.00023999996301427018, "raw_times": [0.08754200007388135, 0.08730200011086708, 0.08575199990445981, 0.0885130000369827, 0.08742199997868738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0908419999632315, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.25617599999350205, "p50": 0.2583450000201992, "p90": 0.2584750000096392, "mean": 0.2583615999810718, "iqr": 0.0005590000000665896, "raw_times": [0.2579160000095726, 0.2583450000201992, 0.25617599999350205, 0.2608959998724458, 0.2584750000096392], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2549850000832521, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8457680000901746, "p50": 0.8511980001912889, "p90": 0.8513080001648632, "mean": 0.8505584000886302, "iqr": 0.003619000153776142, "raw_times": [0.8513080001648632, 0.8457680000901746, 0.847689000011087, 0.8511980001912889, 0.856828999985737], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8516880000115634, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
|