{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047261000020171196, "p50": 0.04859200004148079, "p90": 0.0489899999820409, "mean": 0.048763200015855546, "iqr": 0.0006179999445521389, "raw_times": [0.050600999998096086, 0.0489899999820409, 0.04837200003748876, 0.04859200004148079, 0.047261000020171196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06049099999927421, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053621000006387476, "p50": 0.05462100000386272, "p90": 0.05485100001578758, "mean": 0.054479000004903355, "iqr": 0.0006300000450210064, "raw_times": [0.05462100000386272, 0.053621000006387476, 0.05485100001578758, 0.055081000027712435, 0.05422099997076657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05994100001771585, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052632000006269664, "p50": 0.054361000024982786, "p90": 0.05462100000386272, "mean": 0.05404320000934604, "iqr": 0.0009999999974752427, "raw_times": [0.054361000024982786, 0.05498100000522754, 0.053621000006387476, 0.05462100000386272, 0.052632000006269664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058602000024166045, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05227200000490484, "p50": 0.05285200001026169, "p90": 0.053781000019625935, "mean": 0.05329160001110722, "iqr": 0.0009899999895424116, "raw_times": [0.05285200001026169, 0.05227200000490484, 0.054761999990660115, 0.053781000019625935, 0.052791000030083524], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05603200003179154, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05200100002866748, "p50": 0.054180999995878665, "p90": 0.05433100000118429, "mean": 0.05350320001298314, "iqr": 0.0019299999962640868, "raw_times": [0.054602000034265075, 0.05433100000118429, 0.052401000004920206, 0.05200100002866748, 0.054180999995878665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05716200001870675, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05303100004994121, "p50": 0.05431100004216205, "p90": 0.05439099999193786, "mean": 0.053947000014886726, "iqr": 0.0011999999856016075, "raw_times": [0.05439099999193786, 0.05481099998405625, 0.05319100000633625, 0.05431100004216205, 0.05303100004994121], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05832099998315243, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05105100001401297, "p50": 0.05330099997991056, "p90": 0.05380099997864818, "mean": 0.054202999990593526, "iqr": 0.0006199999802447564, "raw_times": [0.05105100001401297, 0.05330099997991056, 0.05318099999840342, 0.0596809999819925, 0.05380099997864818], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056061000009322015, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164100002730265, "p50": 0.052870999979859334, "p90": 0.05319100000633625, "mean": 0.05809520000639168, "iqr": 0.0004199999921183917, "raw_times": [0.05277100001421786, 0.052870999979859334, 0.05319100000633625, 0.05164100002730265, 0.08000200000424229], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05684100000280523, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050990999966415984, "p50": 0.05245100004458436, "p90": 0.05260099999304657, "mean": 0.05224100000305043, "iqr": 0.0006000000212225132, "raw_times": [0.050990999966415984, 0.05316100003938118, 0.05200099997182406, 0.05260099999304657, 0.05245100004458436], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05579200001193385, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05112100001269937, "p50": 0.0527509999983522, "p90": 0.053382000032797805, "mean": 0.05273720000786852, "iqr": 0.0022010000293448684, "raw_times": [0.05112100001269937, 0.0527509999983522, 0.055250999992040306, 0.053382000032797805, 0.05118100000345294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05559099997753947, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051831000007496186, "p50": 0.05281099998910577, "p90": 0.0528209999970386, "mean": 0.05249119999461982, "iqr": 0.0007599999776175537, "raw_times": [0.0528209999970386, 0.052061000019421044, 0.051831000007496186, 0.05281099998910577, 0.0529319999600375], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05652100003317173, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05226100000754741, "p50": 0.053051999998388055, "p90": 0.053290999971977726, "mean": 0.05317119998835551, "iqr": 0.00048099997229655855, "raw_times": [0.05226100000754741, 0.054441999964183196, 0.053051999998388055, 0.05280999999968117, 0.053290999971977726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758100002140054, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05227100001548024, "p50": 0.05277099995737444, "p90": 0.05359099998258898, "mean": 0.053112999989934906, "iqr": 0.0010999999631167157, "raw_times": [0.05227100001548024, 0.0544409999747586, 0.05359099998258898, 0.05249100001947227, 0.05277099995737444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05578100001457642, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051241999983631104, "p50": 0.052721000031397125, "p90": 0.05347100000108185, "mean": 0.05272120000654468, "iqr": 0.0009499999578110874, "raw_times": [0.05252100004327076, 0.052721000031397125, 0.05347100000108185, 0.05365099997334255, 0.051241999983631104], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07047099995816097, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0512010000193186, "p50": 0.052960999994411395, "p90": 0.05432099999325146, "mean": 0.05357720000347399, "iqr": 0.0024799999778224446, "raw_times": [0.0512010000193186, 0.05756199999495948, 0.05184100001542902, 0.05432099999325146, 0.052960999994411395], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05626099999744838, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05130099998496007, "p50": 0.05301099997723213, "p90": 0.05393200001435616, "mean": 0.05291720000286659, "iqr": 0.0015410000173687877, "raw_times": [0.053951000040797226, 0.05393200001435616, 0.05130099998496007, 0.05301099997723213, 0.052390999996987375], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056620999998813204, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052390999996987375, "p50": 0.052880999987792165, "p90": 0.05298999997194187, "mean": 0.05283679998910884, "iqr": 0.00045899997758169775, "raw_times": [0.05253099999436017, 0.05339099999446262, 0.052390999996987375, 0.052880999987792165, 0.05298999997194187], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057541000046512636, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05287200002612735, "p50": 0.05326199999444725, "p90": 0.05488099998274265, "mean": 0.05554340000344382, "iqr": 0.0019099999803984247, "raw_times": [0.05287200002612735, 0.05326199999444725, 0.052971000002344226, 0.06373100001155763, 0.05488099998274265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05706100000679726, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05237200002738973, "p50": 0.05340200004866347, "p90": 0.054441999964183196, "mean": 0.05358960000876323, "iqr": 0.0017409999486517336, "raw_times": [0.05237200002738973, 0.05503099998804828, 0.054441999964183196, 0.05340200004866347, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056061000009322015, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05089100000077451, "p50": 0.054600999987997056, "p90": 0.054670999986683455, "mean": 0.05481719998670087, "iqr": 0.001720000000204891, "raw_times": [0.05089100000077451, 0.054670999986683455, 0.054600999987997056, 0.06097199997157077, 0.052950999986478564], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05628200000273864, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052641000024777895, "p50": 0.05274100004726279, "p90": 0.05537099997354744, "mean": 0.05482900000970403, "iqr": 0.002670000014859397, "raw_times": [0.05274100004726279, 0.052700999958688044, 0.05537099997354744, 0.06069100004424399, 0.052641000024777895], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056130999951164995, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05228100002341307, "p50": 0.05332099999577622, "p90": 0.053600999990521814, "mean": 0.05359900000030393, "iqr": 0.0007799999934832158, "raw_times": [0.05228100002341307, 0.05332099999577622, 0.0528209999970386, 0.055970999994769954, 0.053600999990521814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05611099999214275, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0509510000483715, "p50": 0.05250099997056168, "p90": 0.05270100001553146, "mean": 0.052225199999611505, "iqr": 0.000470000031782547, "raw_times": [0.05270100001553146, 0.0509510000483715, 0.052230999983748916, 0.05250099997056168, 0.05274199997984397], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05471000002899018, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05066099998884965, "p50": 0.05315100003144835, "p90": 0.05323099998122416, "mean": 0.052780999999413325, "iqr": 0.0009899999895424116, "raw_times": [0.05224099999168175, 0.05462100000386272, 0.05323099998122416, 0.05315100003144835, 0.05066099998884965], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05779099996061632, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}