drbh's picture
drbh HF Staff
Upload folder using huggingface_hub
1c22380 verified
raw
history blame
28.5 kB
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0735019999638098, "p50": 0.07410199998503231, "p90": 0.07441199994673298, "mean": 0.07416379996811884, "iqr": 0.00038999996831989847, "raw_times": [0.07478099996660603, 0.0735019999638098, 0.07441199994673298, 0.07402199997841308, 0.07410199998503231], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08146199996872383, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0912220000373054, "p50": 0.09200200003078862, "p90": 0.09276200000840618, "mean": 0.09224400001812683, "iqr": 0.0012400000173329317, "raw_times": [0.09152199999107324, 0.09276200000840618, 0.0912220000373054, 0.09200200003078862, 0.09371200002306068], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09689300003401513, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08820200002901402, "p50": 0.09085200002800775, "p90": 0.0915720000307374, "mean": 0.09087420002060753, "iqr": 0.002170000016121776, "raw_times": [0.08820200002901402, 0.09434300000066287, 0.08940200001461562, 0.0915720000307374, 0.09085200002800775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0964319999638974, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09069200001476929, "p50": 0.09134200001881254, "p90": 0.09142199996858835, "mean": 0.09263220000548245, "iqr": 0.0006699999630654929, "raw_times": [0.09069200001476929, 0.09075200000552286, 0.09142199996858835, 0.09895300001971918, 0.09134200001881254], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09313199996086041, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0885120000475581, "p50": 0.08998200001997247, "p90": 0.09122199998046199, "mean": 0.09028400000943293, "iqr": 0.0016600000094513234, "raw_times": [0.09122199998046199, 0.0885120000475581, 0.09214200002816142, 0.08998200001997247, 0.08956199997101066], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1227330000119764, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08860200000526675, "p50": 0.09058200004119499, "p90": 0.09118299999499868, "mean": 0.09031840000943703, "iqr": 0.0011699999618031143, "raw_times": [0.08860200000526675, 0.09001300003319557, 0.09058200004119499, 0.09121199997252916, 0.09118299999499868], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09078199997247793, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08772200004614206, "p50": 0.09064199997510514, "p90": 0.09105200001613412, "mean": 0.08990000001176668, "iqr": 0.002190000031987438, "raw_times": [0.08772200004614206, 0.09064199997510514, 0.09105200001613412, 0.0912220000373054, 0.08886199998414668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194199998319164, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08855200002244601, "p50": 0.08938199999874996, "p90": 0.0907319999896572, "mean": 0.0897739999913938, "iqr": 0.0015100000041456951, "raw_times": [0.0892219999855115, 0.0909819999606043, 0.0907319999896572, 0.08855200002244601, 0.08938199999874996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09457200002316313, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.08953200000405559, "p90": 0.08999199997106189, "mean": 0.08967999999640597, "iqr": 0.0006899999789311551, "raw_times": [0.08880199999339311, 0.08930199999213073, 0.08999199997106189, 0.09077200002138852, 0.08953200000405559], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09282199999915974, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904199995640738, "p50": 0.09102199999233562, "p90": 0.09121199997252916, "mean": 0.0907579999761765, "iqr": 0.0006099999723119254, "raw_times": [0.08904199995640738, 0.09191199995939314, 0.09121199997252916, 0.09060200000021723, 0.09102199999233562], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09379199997283649, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09005199996181545, "p50": 0.09118200000557408, "p90": 0.0916120000056253, "mean": 0.09133820000215565, "iqr": 0.0005590000000665896, "raw_times": [0.09005199996181545, 0.09105300000555872, 0.09279200003220467, 0.0916120000056253, 0.09118200000557408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09626199999956953, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2600759999609181, "p50": 0.261636000004728, "p90": 0.2620960000285777, "mean": 0.26208780000160914, "iqr": 0.0012810000384888554, "raw_times": [0.2600759999609181, 0.261636000004728, 0.26581600002373307, 0.26081499999008884, 0.2620960000285777], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.263886000027469, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898199996565381, "p50": 0.09088199999496283, "p90": 0.09099199996853713, "mean": 0.09348599999157159, "iqr": 0.001969999971151992, "raw_times": [0.08898199996565381, 0.09099199996853713, 0.10755200003131904, 0.09088199999496283, 0.08902199999738514], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09600300001011419, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.09035199997242671, "p90": 0.09093199997778356, "mean": 0.09011999998165265, "iqr": 0.0015400000279441883, "raw_times": [0.08939199994983937, 0.09093199997778356, 0.09035199997242671, 0.09112200001482051, 0.08880199999339311], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09145199999238685, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985199997368909, "p50": 0.09101199998440279, "p90": 0.09125200000426048, "mean": 0.09087419999787016, "iqr": 0.0002900000026784255, "raw_times": [0.08985199997368909, 0.09096200000158206, 0.0912930000254164, 0.09125200000426048, 0.09101199998440279], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09303199999521894, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08871200003568447, "p50": 0.0907719999645451, "p90": 0.09140200000956611, "mean": 0.09065600000894847, "iqr": 0.001259999976355175, "raw_times": [0.08871200003568447, 0.09225200000173572, 0.09014200003321093, 0.0907719999645451, 0.09140200000956611], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09131100000558945, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08793200004220125, "p50": 0.0902419999988524, "p90": 0.09114200003068618, "mean": 0.09024000002000321, "iqr": 0.001160000010713702, "raw_times": [0.08793200004220125, 0.08998200001997247, 0.0902419999988524, 0.09114200003068618, 0.09190200000830373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09403199999269418, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.0906619999909708, "p90": 0.09115200003861901, "mean": 0.08998400001019036, "iqr": 0.0016399999935856613, "raw_times": [0.08730199999718025, 0.09115200003861901, 0.09129199997914839, 0.08951200004503335, 0.0906619999909708], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09093099998835896, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08923199999344433, "p50": 0.09018200000809884, "p90": 0.09221200002684782, "mean": 0.09105200000476543, "iqr": 0.0028300000280978566, "raw_times": [0.08923199999344433, 0.09221200002684782, 0.09018200000809884, 0.08938199999874996, 0.09425199999668621], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09410199999138058, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08850200003962527, "p50": 0.0899920000279053, "p90": 0.09176200001093093, "mean": 0.09526220001134789, "iqr": 0.002740000013545796, "raw_times": [0.08850200003962527, 0.0899920000279053, 0.11703299998089278, 0.08902199999738514, 0.09176200001093093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09607300000880059, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902020000239645, "p50": 0.09163200002149097, "p90": 0.09188199999243807, "mean": 0.09142600000586754, "iqr": 0.0006299999881775875, "raw_times": [0.09163200002149097, 0.09216199998718366, 0.09188199999243807, 0.09125200000426048, 0.0902020000239645], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09537199997566859, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08815199998934986, "p50": 0.08920199996964584, "p90": 0.0900620000265917, "mean": 0.08925999999291889, "iqr": 0.001270000041131425, "raw_times": [0.08815199998934986, 0.09009199999354678, 0.0900620000265917, 0.08920199996964584, 0.08879199998546028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09250199997268282, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26207600001271203, "p50": 0.263255999982448, "p90": 0.2654460000144354, "mean": 0.26436599999897226, "iqr": 0.0022400000148081745, "raw_times": [0.26207600001271203, 0.263255999982448, 0.2678459999856386, 0.26320599999962724, 0.2654460000144354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25824599998713893, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8428699999853961, "p50": 0.8440990000053716, "p90": 0.8457790000306886, "mean": 0.8458453999992344, "iqr": 0.0025290000280620006, "raw_times": [0.8428699999853961, 0.8532289999720888, 0.8432500000026266, 0.8440990000053716, 0.8457790000306886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8568399999830945, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}