drbh's picture
drbh HF Staff
Upload folder using huggingface_hub
e8e4be6 verified
raw
history blame
25.8 kB
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1724160000549091, "p50": 0.17308600001797458, "p90": 0.1756759999125279, "mean": 0.1760500000045795, "iqr": 0.0032199998258874984, "raw_times": [0.17245600008664042, 0.1756759999125279, 0.1724160000549091, 0.17308600001797458, 0.18661599995084543], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.17975699995531613, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22642799990535423, "p50": 0.2294280000114668, "p90": 0.23093799995876907, "mean": 0.23135619996992318, "iqr": 0.0026599999500831473, "raw_times": [0.23093799995876907, 0.22642799990535423, 0.22827800000868592, 0.2417089999653399, 0.2294280000114668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23494799995660287, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21688800006813835, "p50": 0.21992799997860857, "p90": 0.2219079999576934, "mean": 0.22172000001319248, "iqr": 0.004439999884198187, "raw_times": [0.2174680000734952, 0.2219079999576934, 0.21688800006813835, 0.23240799998802686, 0.21992799997860857], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.225418000013633, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21487700007583044, "p50": 0.21964699999443837, "p90": 0.22132800006602338, "mean": 0.21978760003094067, "iqr": 0.005100000066704524, "raw_times": [0.21487700007583044, 0.21622799999931885, 0.21964699999443837, 0.22132800006602338, 0.2268580000190923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24882799993974913, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21332699998311, "p50": 0.21615699995436444, "p90": 0.21744800005762954, "mean": 0.21590960000139603, "iqr": 0.0025000000505315256, "raw_times": [0.21332699998311, 0.21744800005762954, 0.21494800000709802, 0.21766800000477815, 0.21615699995436444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22001800005000405, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21038799991401902, "p50": 0.21561700009442575, "p90": 0.21720800009461527, "mean": 0.22098599999935686, "iqr": 0.004100000182916119, "raw_times": [0.21038799991401902, 0.21720800009461527, 0.21561700009442575, 0.24860899998202513, 0.21310799991169915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2178580000418151, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21404700009952649, "p50": 0.21557699994900759, "p90": 0.2158679999411106, "mean": 0.2152116000161186, "iqr": 0.0011999999287581886, "raw_times": [0.2158679999411106, 0.21589800007859594, 0.21404700009952649, 0.21466800001235242, 0.21557699994900759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21567799990407366, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21701799994389148, "p50": 0.21822700000484474, "p90": 0.22002800005793688, "mean": 0.2237478000097326, "iqr": 0.002031000008173578, "raw_times": [0.22002800005793688, 0.2179970000497633, 0.2454689999922266, 0.21822700000484474, 0.21701799994389148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22291799996310147, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21712800003115262, "p50": 0.21885700004986575, "p90": 0.2196080000658185, "mean": 0.22401780001928273, "iqr": 0.001630000042496249, "raw_times": [0.21797800002332224, 0.2196080000658185, 0.24651799992625456, 0.21885700004986575, 0.21712800003115262], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2206780000051367, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21462800009430794, "p50": 0.21782799990432977, "p90": 0.21795700001803198, "mean": 0.21911359999648994, "iqr": 0.0030300000162242213, "raw_times": [0.21462800009430794, 0.23022799996397225, 0.21782799990432977, 0.21492700000180776, 0.21795700001803198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2186980000260519, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21579799999926763, "p50": 0.21701699995446688, "p90": 0.22130799993647088, "mean": 0.2237457999626713, "iqr": 0.004450000005817856, "raw_times": [0.21701699995446688, 0.22130799993647088, 0.21579799999926763, 0.24774799999249808, 0.21685799993065302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22235700009787251, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22434800007431477, "p50": 0.2248280000003433, "p90": 0.22490799995011912, "mean": 0.22479799997654482, "iqr": 0.00031000001854408765, "raw_times": [0.2248280000003433, 0.22490799995011912, 0.22459799993157503, 0.22434800007431477, 0.22530799992637185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23522799995134847, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21574699997017888, "p50": 0.21802799994929956, "p90": 0.21904799996264046, "mean": 0.22033179998288688, "iqr": 0.0018999999156221747, "raw_times": [0.21714800004701829, 0.21802799994929956, 0.2316879999852972, 0.21904799996264046, 0.21574699997017888], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22168800001054478, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21753800001533818, "p50": 0.21888800006308884, "p90": 0.22129700005280029, "mean": 0.22190180004599824, "iqr": 0.003358999947522534, "raw_times": [0.21753800001533818, 0.23384799999348616, 0.21793800010527775, 0.21888800006308884, 0.22129700005280029], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22266799999215436, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2166670000178783, "p50": 0.21850699999959033, "p90": 0.21964699999443837, "mean": 0.21864339998955984, "iqr": 0.001419000000169035, "raw_times": [0.21850699999959033, 0.2166670000178783, 0.22016799994162284, 0.21822799999426934, 0.21964699999443837], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23552799996195972, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21624800001518452, "p50": 0.21773700007088337, "p90": 0.21802799994929956, "mean": 0.21774760000425886, "iqr": 0.0013409999155555852, "raw_times": [0.21668700003374397, 0.21773700007088337, 0.22003799995218287, 0.21624800001518452, 0.21802799994929956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226780000000872, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21748699998624943, "p50": 0.22014700005001941, "p90": 0.22206799997093185, "mean": 0.22232159999475698, "iqr": 0.0019999999949504854, "raw_times": [0.22014700005001941, 0.23183799999060284, 0.22206799997093185, 0.21748699998624943, 0.22006799997598137], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22040800001832395, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21601800006010308, "p50": 0.21957800004202, "p90": 0.22023799999715266, "mean": 0.2213318000030995, "iqr": 0.0024510000002919696, "raw_times": [0.23303799991936103, 0.21601800006010308, 0.2177869999968607, 0.21957800004202, 0.22023799999715266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.220787999978711, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21692799998618284, "p50": 0.22003699996275827, "p90": 0.2230679999684071, "mean": 0.222287800011145, "iqr": 0.0031599998919773498, "raw_times": [0.21692799998618284, 0.21990800007642974, 0.2314980000619471, 0.2230679999684071, 0.22003699996275827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22102700006598752, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160679999860804, "p50": 0.21972700005790102, "p90": 0.22029800004474964, "mean": 0.21970960001453932, "iqr": 0.0024610000082248007, "raw_times": [0.2160679999860804, 0.2246179999474407, 0.22029800004474964, 0.21972700005790102, 0.21783700003652484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22191799996562622, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172279999967941, "p50": 0.21847799996521644, "p90": 0.22105800007921061, "mean": 0.22193580000475777, "iqr": 0.0035110000453641987, "raw_times": [0.21847799996521644, 0.22105800007921061, 0.23536799994872126, 0.21754700003384642, 0.2172279999967941], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22206799997093185, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21436800000174117, "p50": 0.21785799992812827, "p90": 0.2195579999124675, "mean": 0.2202379999744153, "iqr": 0.0030299999025373836, "raw_times": [0.21436800000174117, 0.21785799992812827, 0.2195579999124675, 0.2165280000099301, 0.2328780000198094], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25353900002755836, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22968799999034673, "p50": 0.23015800002212927, "p90": 0.23064800006977748, "mean": 0.23369620002995362, "iqr": 0.0006600000688194996, "raw_times": [0.23015800002212927, 0.24799900006655662, 0.22968799999034673, 0.22998800000095798, 0.23064800006977748], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042800000894204, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347319999804313, "p50": 0.6375930000785957, "p90": 0.639283000055002, "mean": 0.6376124000325945, "iqr": 0.003270999968663091, "raw_times": [0.6375930000785957, 0.636012000086339, 0.6404419999626043, 0.6347319999804313, 0.639283000055002], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.639422999938688, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}