{"ts": "2026-01-04T21:50:23Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07355200000347395, "p50": 0.07458200002474769, "p90": 0.07503200004066457, "mean": 0.07451980004589132, "iqr": 0.001300999883824261, "raw_times": [0.07503200004066457, 0.07570200000373006, 0.07355200000347395, 0.07373100015684031, 0.07458200002474769], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07890100005170098, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:23Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08308199994644383, "p50": 0.08427200009464286, "p90": 0.08442200009994849, "mean": 0.08427200004916813, "iqr": 0.0003300001480965875, "raw_times": [0.08427200009464286, 0.0840919999518519, 0.08549200015295355, 0.08308199994644383, 0.08442200009994849], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0866919999680249, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:23Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0821520000044984, "p50": 0.08325199996761512, "p90": 0.08339200007867476, "mean": 0.0833319999401283, "iqr": 0.0011800002539530396, "raw_times": [0.08221199982472172, 0.0856519998251315, 0.08339200007867476, 0.0821520000044984, 0.08325199996761512], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08935200003179489, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08073100002548017, "p50": 0.08201199989343877, "p90": 0.08312199997817515, "mean": 0.08611579996795626, "iqr": 0.0017499999103165464, "raw_times": [0.08137200006785861, 0.10334199987482862, 0.08312199997817515, 0.08073100002548017, 0.08201199989343877], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08758200010561268, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08196199996746145, "p50": 0.082431999999244, "p90": 0.08386199988308363, "mean": 0.08293399996546214, "iqr": 0.0018599998838908505, "raw_times": [0.08441199997832882, 0.08386199988308363, 0.082431999999244, 0.08200199999919278, 0.08196199996746145], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0874719999046647, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08103200002551603, "p50": 0.08222199994634138, "p90": 0.0833209999200335, "mean": 0.08253780001723499, "iqr": 0.0019089998204435688, "raw_times": [0.08103200002551603, 0.08470200009469409, 0.0833209999200335, 0.08222199994634138, 0.08141200009958993], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08676199990986788, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08134199993037328, "p50": 0.08267199996225827, "p90": 0.08288200001516088, "mean": 0.08249380002780526, "iqr": 0.000270999862550525, "raw_times": [0.08261100015261036, 0.08267199996225827, 0.08296200007862353, 0.08288200001516088, 0.08134199993037328], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11819299993476307, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08164199994098453, "p50": 0.08249200004684099, "p90": 0.08269199997812393, "mean": 0.08447800000794814, "iqr": 0.0007899998308857903, "raw_times": [0.08164199994098453, 0.08269199997812393, 0.08249200004684099, 0.09366199992655311, 0.08190200014723814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08653200006847328, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08105199981400801, "p50": 0.08159200001500722, "p90": 0.0820519999251701, "mean": 0.08180199993148562, "iqr": 0.000470000031782547, "raw_times": [0.08105199981400801, 0.08159200001500722, 0.08273200000985526, 0.0820519999251701, 0.08158199989338755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08815199998934986, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08115200012071, "p50": 0.08377199992537498, "p90": 0.08408199983023223, "mean": 0.0831979999475152, "iqr": 0.001199999815071351, "raw_times": [0.08408199983023223, 0.0841019998460979, 0.08377199992537498, 0.08115200012071, 0.08288200001516088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0894119998520182, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.15871300001890631, "p50": 0.15948400005072472, "p90": 0.16057400011959544, "mean": 0.15980340003807214, "iqr": 0.0012509999578469433, "raw_times": [0.15948400005072472, 0.15871300001890631, 0.16092299983938574, 0.16057400011959544, 0.1593230001617485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16102299991871405, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16207299995585345, "p50": 0.1630340000247088, "p90": 0.16339400008291705, "mean": 0.1631278000331804, "iqr": 0.000580000005356851, "raw_times": [0.16207299995585345, 0.16339400008291705, 0.1628140000775602, 0.1630340000247088, 0.16432400002486247], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1644829999349895, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08101100002022577, "p50": 0.08208200006265542, "p90": 0.08235099994635675, "mean": 0.08208539998122433, "iqr": 0.00027999999474559445, "raw_times": [0.08207099995161116, 0.08235099994635675, 0.08291199992527254, 0.08101100002022577, 0.08208200006265542], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08729199998924742, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08070199987741944, "p50": 0.08170099999915692, "p90": 0.08236199983002734, "mean": 0.08180379995792464, "iqr": 0.00085999977272877, "raw_times": [0.08150200005729857, 0.08236199983002734, 0.08275200002572092, 0.08170099999915692, 0.08070199987741944], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0839819999782776, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08031200013647322, "p50": 0.08140199997797026, "p90": 0.08294200006275787, "mean": 0.08193580001716327, "iqr": 0.0019210001482861117, "raw_times": [0.08031200013647322, 0.08294200006275787, 0.08140199997797026, 0.08400199999414326, 0.08102099991447176], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08704200013198715, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08123200018417265, "p50": 0.08208200006265542, "p90": 0.08256199998868397, "mean": 0.08218960006161069, "iqr": 0.0008409999736613827, "raw_times": [0.08208200006265542, 0.08335100005751883, 0.08256199998868397, 0.08123200018417265, 0.08172100001502258], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08772199998929864, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08169200009433553, "p50": 0.08265200017376628, "p90": 0.08277200004158658, "mean": 0.08255980001194985, "iqr": 0.0007500002539018169, "raw_times": [0.08366099996237608, 0.08169200009433553, 0.08277200004158658, 0.08265200017376628, 0.08202199978768476], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08715200010556146, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08216200012611807, "p50": 0.08315100012623589, "p90": 0.08342199998878641, "mean": 0.08315960003528744, "iqr": 0.0005510000846697949, "raw_times": [0.08287099990411662, 0.08342199998878641, 0.08216200012611807, 0.08315100012623589, 0.08419200003118021], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08595200006311643, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08146200002556725, "p50": 0.08273099979305698, "p90": 0.08284199998342956, "mean": 0.08254179992945865, "iqr": 0.00034000004234258085, "raw_times": [0.08250199994108698, 0.08146200002556725, 0.08273099979305698, 0.08317199990415247, 0.08284199998342956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0868520000949502, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07963099983498978, "p50": 0.08124200007841864, "p90": 0.08184200009964115, "mean": 0.08129759999064845, "iqr": 0.000601000238020788, "raw_times": [0.08124099986162037, 0.08184200009964115, 0.08253200007857231, 0.07963099983498978, 0.08124200007841864], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08599200009484775, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09385199996359006, "p50": 0.09531199998491502, "p90": 0.0969730001543212, "mean": 0.09663020005064027, "iqr": 0.0026110001272172667, "raw_times": [0.09385199996359006, 0.0969730001543212, 0.09531199998491502, 0.10265200012327114, 0.09436200002710393], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09528199984742969, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09982300002775446, "p50": 0.10141300003851939, "p90": 0.10143200006496045, "mean": 0.10232060003545485, "iqr": 0.0002189999577240087, "raw_times": [0.09982300002775446, 0.10143200006496045, 0.10121300010723644, 0.10772199993880349, 0.10141300003851939], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10236299999633047, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4812109998511005, "p50": 0.48388999994131154, "p90": 0.4854010001054121, "mean": 0.48431479999635485, "iqr": 0.002060000042547472, "raw_times": [0.48388999994131154, 0.4812109998511005, 0.4877310000210855, 0.4854010001054121, 0.4833410000628646], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48655099999450613, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2026-01-04T21:50:24Z", "run": "8cc422ce4a7e441a8a168de4e51d3f05", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4982410000593518, "p50": 0.4998210001758707, "p90": 0.5022709999593644, "mean": 0.5008630000247649, "iqr": 0.0036199999158270657, "raw_times": [0.4998210001758707, 0.4982410000593518, 0.5053309998857003, 0.49865100004353735, 0.5022709999593644], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5012599999645317, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}