File size: 24,462 Bytes
3b25788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04374100012682902, "p50": 0.046281000095405034, "p90": 0.04699100009020185, "mean": 0.04588520009747299, "iqr": 0.0018789999103319133, "raw_times": [0.0473009999950591, 0.04699100009020185, 0.04511200017986994, 0.046281000095405034, 0.04374100012682902], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054351000017049955, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05076099978396087, "p50": 0.05160099999557133, "p90": 0.0517210000907653, "mean": 0.05158899998605193, "iqr": 0.00034000004234258085, "raw_times": [0.05076099978396087, 0.052481000011539436, 0.05138100004842272, 0.0517210000907653, 0.05160099999557133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06265199999688775, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984099996363511, "p50": 0.05232199987403874, "p90": 0.05389100010688708, "mean": 0.05356520000532328, "iqr": 0.0016999999843392288, "raw_times": [0.04984099996363511, 0.05389100010688708, 0.05232199987403874, 0.05958099995950761, 0.05219100012254785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472199995892879, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050031999990096665, "p50": 0.0509510000483715, "p90": 0.0512909998633404, "mean": 0.05097519997434574, "iqr": 0.0009099996987060877, "raw_times": [0.050381000164634315, 0.0512909998633404, 0.05222099980528583, 0.0509510000483715, 0.050031999990096665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05585200005953084, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048549999974056846, "p50": 0.04953099983140419, "p90": 0.04999000020688982, "mean": 0.04990460001863539, "iqr": 0.0006790000952605624, "raw_times": [0.04953099983140419, 0.052140999969196855, 0.04999000020688982, 0.049311000111629255, 0.048549999974056846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054900999884921475, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04891100002168969, "p50": 0.04958099998475518, "p90": 0.05115099997965444, "mean": 0.05000500000278407, "iqr": 0.002170000016121776, "raw_times": [0.04891100002168969, 0.05140100006428838, 0.04958099998475518, 0.05115099997965444, 0.04898099996353267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05376100011744711, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04850100003750413, "p50": 0.04982199993719405, "p90": 0.04984099996363511, "mean": 0.04968119997101894, "iqr": 0.0003499999365885742, "raw_times": [0.04850100003750413, 0.04984099996363511, 0.04949100002704654, 0.050750999889714876, 0.04982199993719405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054501000022355583, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049300999990009586, "p50": 0.04983100006938912, "p90": 0.050170999884358025, "mean": 0.05779919997621619, "iqr": 0.0004900000476482091, "raw_times": [0.09001200010061439, 0.04983100006938912, 0.049680999836709816, 0.050170999884358025, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052752000101463636, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047761000132595655, "p50": 0.04934100002174091, "p90": 0.049350999915986904, "mean": 0.0488690000565839, "iqr": 0.0008099998467514524, "raw_times": [0.04854100006923545, 0.04935100014336058, 0.049350999915986904, 0.04934100002174091, 0.047761000132595655], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052481000011539436, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048450999884153134, "p50": 0.04965099992659816, "p90": 0.050621000127648585, "mean": 0.05171900002096663, "iqr": 0.0015900000107649248, "raw_times": [0.048450999884153134, 0.04903100011688366, 0.04965099992659816, 0.050621000127648585, 0.06084100004954962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053670999932364793, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047580999989804695, "p50": 0.0487410000005184, "p90": 0.050201000021843356, "mean": 0.04913100001431303, "iqr": 0.002320000021427404, "raw_times": [0.047580999989804695, 0.050201000021843356, 0.051251000058982754, 0.04788100000041595, 0.0487410000005184], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054761000001235516, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.049540999953023857, "p90": 0.04967099994246382, "mean": 0.049752999939300935, "iqr": 0.0003000000106112566, "raw_times": [0.049370999931852566, 0.049540999953023857, 0.049270999852524255, 0.05091100001664017, 0.04967099994246382], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053771000011693104, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04857099997934711, "p50": 0.04948099990542687, "p90": 0.049690999958329485, "mean": 0.04942899995512562, "iqr": 0.0004900000476482091, "raw_times": [0.04857099997934711, 0.04948099990542687, 0.049690999958329485, 0.049200999910681276, 0.050201000021843356], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05436199990072055, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048160999995161546, "p50": 0.049621000016486505, "p90": 0.05022099981033534, "mean": 0.04960719993505336, "iqr": 0.0007689998255955288, "raw_times": [0.048160999995161546, 0.049621000016486505, 0.05022099981033534, 0.049451999984739814, 0.050580999868543586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053051000122650294, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04980199992132839, "p50": 0.05055099995843193, "p90": 0.050551000185805606, "mean": 0.05498319997059298, "iqr": 0.00047000025915622246, "raw_times": [0.07393099986074958, 0.05008099992664938, 0.05055099995843193, 0.050551000185805606, 0.04980199992132839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053621999995812075, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048960999947667005, "p50": 0.05131100010657974, "p90": 0.052661000154330395, "mean": 0.051906999988204916, "iqr": 0.0029200002700235927, "raw_times": [0.048960999947667005, 0.052661000154330395, 0.056860999848140636, 0.0497409998843068, 0.05131100010657974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053932000128043, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04804100012734125, "p50": 0.04966200003764243, "p90": 0.05004099989491806, "mean": 0.04977120001967705, "iqr": 0.0007399999049084727, "raw_times": [0.04804100012734125, 0.05181100004847394, 0.04966200003764243, 0.05004099989491806, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05391199988480366, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047730999995110324, "p50": 0.04978099991603813, "p90": 0.049860999979500775, "mean": 0.04935500001010951, "iqr": 0.0009599998520570807, "raw_times": [0.047730999995110324, 0.04978099991603813, 0.05050100003245461, 0.048901000127443695, 0.049860999979500775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0537809999059391, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.05024100005357468, "p90": 0.05084099984742352, "mean": 0.050164999993285164, "iqr": 0.0012699997569143306, "raw_times": [0.049270999852524255, 0.05024100005357468, 0.05084099984742352, 0.05090100012239418, 0.04957100009050919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05270199994811264, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04698099996858218, "p50": 0.049690999958329485, "p90": 0.04984100019100879, "mean": 0.049111000043922104, "iqr": 0.00083000008999079, "raw_times": [0.04698099996858218, 0.04984100019100879, 0.050031000000672066, 0.049011000101018, 0.049690999958329485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05244099997980811, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04767199993693794, "p50": 0.049621000016486505, "p90": 0.04999099996894074, "mean": 0.049529399984749034, "iqr": 0.0007789999472151976, "raw_times": [0.04767199993693794, 0.049621000016486505, 0.05115099997965444, 0.04999099996894074, 0.049212000021725544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05385099984778208, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04878100003224972, "p50": 0.049651000153971836, "p90": 0.050100999942515045, "mean": 0.04970100003447442, "iqr": 0.0005699998837371822, "raw_times": [0.04878100003224972, 0.050440999984857626, 0.049651000153971836, 0.04953100005877786, 0.050100999942515045], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054691000059392536, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04801099998985592, "p50": 0.049041000011129654, "p90": 0.04916099987894995, "mean": 0.04916500001854729, "iqr": 0.00038999974094622303, "raw_times": [0.04801099998985592, 0.050841000074797194, 0.04916099987894995, 0.04877100013800373, 0.049041000011129654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05514100007530942, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04843099986828747, "p50": 0.04957099986313551, "p90": 0.04984099996363511, "mean": 0.04942899995512562, "iqr": 0.00046999980440887157, "raw_times": [0.04843099986828747, 0.04937100015922624, 0.049930999921343755, 0.04957099986313551, 0.04984099996363511], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05356199994821509, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}