| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17331300000478223, "p50": 0.17603300000246236, "p90": 0.1797429999896849, "mean": 0.1784169999950791, "iqr": 0.0038800000083938357, "raw_times": [0.17603300000246236, 0.17586299998129107, 0.1797429999896849, 0.18713299999717492, 0.17331300000478223], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18657300000768373, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21556299998337636, "p50": 0.2165239999953883, "p90": 0.21698299997297, "mean": 0.21635159998822928, "iqr": 0.0013189999776841432, "raw_times": [0.2165239999953883, 0.21698299997297, 0.21566399999528585, 0.21556299998337636, 0.21702399999412592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21905399995603148, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21391299998185787, "p50": 0.21503299996084024, "p90": 0.21681300000864212, "mean": 0.21537540000053923, "iqr": 0.0027289999593449465, "raw_times": [0.21503299996084024, 0.21408400004929717, 0.21681300000864212, 0.21703400000205875, 0.21391299998185787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2214840000078766, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21356299998842587, "p50": 0.2151840000124139, "p90": 0.2162740000244412, "mean": 0.21532140000317668, "iqr": 0.0011410000411160581, "raw_times": [0.2162740000244412, 0.21513299998332513, 0.2151840000124139, 0.2164530000072773, 0.21356299998842587], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2165939999940747, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21375400001488742, "p50": 0.21507399998199617, "p90": 0.21535299998731716, "mean": 0.21505959999785773, "iqr": 0.0006099999723119254, "raw_times": [0.21474300001500524, 0.21375400001488742, 0.21535299998731716, 0.21507399998199617, 0.21637399999008267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2174030000219318, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2112430000238419, "p50": 0.21400400004267794, "p90": 0.21425299996735703, "mean": 0.21312160000661606, "iqr": 0.002878999964650575, "raw_times": [0.21137400000270645, 0.21425299996735703, 0.21400400004267794, 0.214733999996497, 0.2112430000238419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22874399996908323, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21246400001473376, "p50": 0.2133630000002995, "p90": 0.21390399996334963, "mean": 0.2133594000042649, "iqr": 0.0008009999419300584, "raw_times": [0.21396300002152202, 0.21246400001473376, 0.21390399996334963, 0.21310300002141958, 0.2133630000002995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2195139999798812, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21415399999114015, "p50": 0.21443299999646115, "p90": 0.2147029999832739, "mean": 0.2148253999962435, "iqr": 0.000368999963029637, "raw_times": [0.21650299999009803, 0.21415399999114015, 0.21443299999646115, 0.21433400002024428, 0.2147029999832739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2173330000232454, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21359300001222437, "p50": 0.2138830000149028, "p90": 0.21400299999640993, "mean": 0.21457699999700708, "iqr": 0.00012000003835055395, "raw_times": [0.21400299999640993, 0.2138830000149028, 0.21359300001222437, 0.21752300000343894, 0.21388299995805937], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2191329999732261, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21296400001347138, "p50": 0.21389400001226022, "p90": 0.21517300001505646, "mean": 0.21466560000362733, "iqr": 0.0013790000252811296, "raw_times": [0.21296400001347138, 0.21517300001505646, 0.21750299998757328, 0.21379399998977533, 0.21389400001226022], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21542299998600356, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2098030000183826, "p50": 0.21347300003071723, "p90": 0.21457399998325855, "mean": 0.21505920001345658, "iqr": 0.0023309999619414157, "raw_times": [0.2098030000183826, 0.21457399998325855, 0.21347300003071723, 0.21224300002131713, 0.22520300001360738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21741399996244581, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22297399999615664, "p50": 0.22381400003723684, "p90": 0.22385300002270014, "mean": 0.2239618000203336, "iqr": 0.0007890000119914475, "raw_times": [0.2230640000107087, 0.22385300002270014, 0.22381400003723684, 0.22610400003486575, 0.22297399999615664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22540300000173374, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21275400001741218, "p50": 0.21372400004793235, "p90": 0.21630299994512825, "mean": 0.22107159999222858, "iqr": 0.0030399999673136335, "raw_times": [0.21372400004793235, 0.24931399997285553, 0.21630299994512825, 0.21326299997781462, 0.21275400001741218], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21886299998641334, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21458399999119138, "p50": 0.21627299997817317, "p90": 0.21634299997685957, "mean": 0.21600339998713025, "iqr": 0.0007099999947968172, "raw_times": [0.21627299997817317, 0.21718400000736438, 0.21563299998206276, 0.21458399999119138, 0.21634299997685957], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226130000053672, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2140940000003866, "p50": 0.215932999992674, "p90": 0.21619400001782196, "mean": 0.21597160000510485, "iqr": 0.0015699999948992627, "raw_times": [0.2140940000003866, 0.2146240000229227, 0.215932999992674, 0.21619400001782196, 0.21901299999171897], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2184540000484958, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21190300003581797, "p50": 0.21745400005102056, "p90": 0.21756400002459486, "mean": 0.21624960003236993, "iqr": 0.0009400000067216752, "raw_times": [0.21190300003581797, 0.21745400005102056, 0.21756400002459486, 0.2166240000178732, 0.21770300003254306], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25062399998887486, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21696300001394775, "p50": 0.21815399998104112, "p90": 0.21820400002070528, "mean": 0.21879360000411907, "iqr": 0.0004510000053414842, "raw_times": [0.2177530000153638, 0.21815399998104112, 0.2228939999895374, 0.21696300001394775, 0.21820400002070528], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2236640000319312, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21379300000035073, "p50": 0.21643299999141163, "p90": 0.21674399999938032, "mean": 0.21709340001052624, "iqr": 0.00039999997625272954, "raw_times": [0.21379300000035073, 0.21674399999938032, 0.21643299999141163, 0.2163440000231276, 0.22215300003836091], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21868300001415264, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133630000002995, "p50": 0.21632300001783733, "p90": 0.21671399997558183, "mean": 0.21582319999424726, "iqr": 0.0009309999882134434, "raw_times": [0.21693299999014926, 0.2133630000002995, 0.21632300001783733, 0.21671399997558183, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2180729999849973, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21294399999760572, "p50": 0.21446299996341622, "p90": 0.21984300002486634, "mean": 0.21647359999406035, "iqr": 0.006489000043075066, "raw_times": [0.21335399998179128, 0.2217640000026222, 0.21984300002486634, 0.21294399999760572, 0.21446299996341622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21826299996519083, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21578299998736838, "p50": 0.21700399997826025, "p90": 0.2204729999562005, "mean": 0.21918559997402554, "iqr": 0.004118999981983507, "raw_times": [0.21700399997826025, 0.22631399997408153, 0.2204729999562005, 0.216353999974217, 0.21578299998736838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22091399995360916, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2135729999963587, "p50": 0.2144540000017514, "p90": 0.2173039999888715, "mean": 0.21536960000503313, "iqr": 0.003270999968663091, "raw_times": [0.21403300002020842, 0.2144540000017514, 0.2135729999963587, 0.2173039999888715, 0.21748400001797563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22203300000001036, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22941399998899215, "p50": 0.23028300000760282, "p90": 0.23160400002097958, "mean": 0.23061779999125065, "iqr": 0.0017800000478018774, "raw_times": [0.231963999965501, 0.22941399998899215, 0.2298239999731777, 0.23160400002097958, 0.23028300000760282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23190299998532282, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |
| {"ts": "2025-12-19T18:55:30Z", "run": "4727714419ad4272bf9529583f4df39b", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6428210000422041, "p50": 0.6484909999926458, "p90": 0.6486400000085268, "mean": 0.6472164000115299, "iqr": 0.0035200000070290116, "raw_times": [0.651010000012775, 0.6428210000422041, 0.6486400000085268, 0.6451200000014978, 0.6484909999926458], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6451109999829896, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} | |