diff --git "a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl" "b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl" --- "a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl" +++ "b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl" @@ -1,48 +1,48 @@ -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D1024", "batch": 1, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.028959999326616526, "p50": 0.029670016374439, "p90": 0.02976099494844675, "mean": 0.030270603019744158, "iqr": 0.00046996865421533585, "raw_times": [0.028959999326616526, 0.02976099494844675, 0.029291026294231415, 0.029670016374439, 0.0336709781549871], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03676099004223943, "peak_bytes": 2363392, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D2048", "batch": 1, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0324210268445313, "p50": 0.03255100455135107, "p90": 0.03347999881953001, "mean": 0.0366490101441741, "iqr": 0.0010589719749987125, "raw_times": [0.0324210268445313, 0.05237199366092682, 0.03347999881953001, 0.03255100455135107, 0.0324210268445313], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03719097003340721, "peak_bytes": 4726784, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0728836059570312e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D4096", "batch": 1, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030800001695752144, "p50": 0.032610027119517326, "p90": 0.03269099397584796, "mean": 0.03422859590500593, "iqr": 0.0001300359144806862, "raw_times": [0.042480998672544956, 0.03256095806136727, 0.032610027119517326, 0.03269099397584796, 0.030800001695752144], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.038141035474836826, "peak_bytes": 9453568, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00153350830078125, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D8192", "batch": 1, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.031030969694256783, "p50": 0.03305997233837843, "p90": 0.03334099892526865, "mean": 0.034240796230733395, "iqr": 0.0011699739843606949, "raw_times": [0.031030969694256783, 0.03334099892526865, 0.03305997233837843, 0.032171024940907955, 0.041601015254855156], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03404001472517848, "peak_bytes": 18907136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00148773193359375, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030020950362086296, "p50": 0.03277999348938465, "p90": 0.03290100721642375, "mean": 0.03224459942430258, "iqr": 0.0006309710443019867, "raw_times": [0.03227003617212176, 0.03290100721642375, 0.03277999348938465, 0.030020950362086296, 0.03325100988149643], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034740951377898455, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001556396484375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029260001610964537, "p50": 0.031291041523218155, "p90": 0.03236101474612951, "mean": 0.03568681422621012, "iqr": 0.0016400008462369442, "raw_times": [0.029260001610964537, 0.03236101474612951, 0.054800999350845814, 0.03072101389989257, 0.031291041523218155], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03347097663208842, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029839982744306326, "p50": 0.031700998079031706, "p90": 0.03264000406488776, "mean": 0.031820591539144516, "iqr": 0.0009690411388874054, "raw_times": [0.029839982744306326, 0.03167096292600036, 0.03264000406488776, 0.031700998079031706, 0.03325100988149643], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03392098005861044, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03726099384948611, "p50": 0.03943097544834018, "p90": 0.040319981053471565, "mean": 0.039948790799826384, "iqr": 0.0009189825505018234, "raw_times": [0.03726099384948611, 0.03940099850296974, 0.04333100514486432, 0.040319981053471565, 0.03943097544834018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.040440005250275135, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0848045349121094e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.029730028472840786, "p50": 0.03197102341800928, "p90": 0.03233103780075908, "mean": 0.03148262621834874, "iqr": 0.001300009898841381, "raw_times": [0.029730028472840786, 0.03197102341800928, 0.03235001349821687, 0.031031027901917696, 0.03233103780075908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034419994335621595, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1324882507324219e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030681025236845016, "p50": 0.031400995794683695, "p90": 0.03258103970438242, "mean": 0.03199680941179395, "iqr": 0.0015910482034087181, "raw_times": [0.030681025236845016, 0.03258103970438242, 0.034330994822084904, 0.0309899915009737, 0.031400995794683695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03291096072643995, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03773096250370145, "p50": 0.03855105023831129, "p90": 0.03870099317282438, "mean": 0.038839003536850214, "iqr": 0.00018998980522155762, "raw_times": [0.03773096250370145, 0.04070100840181112, 0.03855105023831129, 0.038511003367602825, 0.03870099317282438], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04059000639244914, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05229096859693527, "p50": 0.05274196155369282, "p90": 0.052841962315142155, "mean": 0.05267937667667866, "iqr": 0.00024097971618175507, "raw_times": [0.05274196155369282, 0.052841962315142155, 0.05229096859693527, 0.05292100831866264, 0.0526009825989604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05453097401186824, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001495361328125, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.033741001971066, "p50": 0.03515096614137292, "p90": 0.035751028917729855, "mean": 0.035592797212302685, "iqr": 0.0010509975254535675, "raw_times": [0.033741001971066, 0.03515096614137292, 0.038620957639068365, 0.035751028917729855, 0.03470003139227629], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03478099824860692, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.040161015931516886, "p50": 0.04095997428521514, "p90": 0.04124100087210536, "mean": 0.04105480620637536, "iqr": 0.00031996751204133034, "raw_times": [0.040161015931516886, 0.04199100658297539, 0.04124100087210536, 0.04095997428521514, 0.04092103336006403], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04168099258095026, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05262100603431463, "p50": 0.05288200918585062, "p90": 0.0531109981238842, "mean": 0.05337720504030585, "iqr": 0.00027997884899377823, "raw_times": [0.05283101927489042, 0.05262100603431463, 0.05288200918585062, 0.0531109981238842, 0.05544099258258939], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05422096000984311, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2716059680096805, "p50": 0.2742449869401753, "p90": 0.2774460008367896, "mean": 0.2755257999524474, "iqr": 0.0037999707274138927, "raw_times": [0.28068601386621594, 0.2736460301093757, 0.2742449869401753, 0.2716059680096805, 0.2774460008367896], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27578597655519843, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030030030757188797, "p50": 0.031871022656559944, "p90": 0.03221101360395551, "mean": 0.03341861302033067, "iqr": 0.00040099257603287697, "raw_times": [0.030030030757188797, 0.03221101360395551, 0.04117097705602646, 0.031871022656559944, 0.03181002102792263], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034871045500040054, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.001556396484375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.02923101419582963, "p50": 0.030590977985411882, "p90": 0.030929979402571917, "mean": 0.030384794808924198, "iqr": 0.0010989606380462646, "raw_times": [0.02923101419582963, 0.03134098369628191, 0.030590977985411882, 0.029831018764525652, 0.030929979402571917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03408099291846156, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.030279974453151226, "p50": 0.03076100256294012, "p90": 0.03130995901301503, "mean": 0.031078385654836893, "iqr": 0.0005599576979875565, "raw_times": [0.03130995901301503, 0.03076100256294012, 0.030279974453151226, 0.03229099093005061, 0.030750001315027475], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03285001730546355, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03846001345664263, "p50": 0.03899098373949528, "p90": 0.039101054426282644, "mean": 0.039028620813041925, "iqr": 0.00023102620616555214, "raw_times": [0.03899098373949528, 0.039101054426282644, 0.039721024222671986, 0.03887002822011709, 0.03846001345664263], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.040511018596589565, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0014801025390625, "mse": 1.0848045349121094e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0330110196955502, "p50": 0.034221040550619364, "p90": 0.03488000947982073, "mean": 0.03425482427701354, "iqr": 0.0009589712135493755, "raw_times": [0.0330110196955502, 0.034221040550619364, 0.03524101339280605, 0.03392103826627135, 0.03488000947982073], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.036880956031382084, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.039421021938323975, "p50": 0.039631035178899765, "p90": 0.041121034882962704, "mean": 0.04027721006423235, "iqr": 0.0015800469554960728, "raw_times": [0.039631035178899765, 0.04167197039350867, 0.041121034882962704, 0.03954098792746663, 0.039421021938323975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04271004581823945, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.052101968321949244, "p50": 0.05301198689267039, "p90": 0.053400988690555096, "mean": 0.052991590928286314, "iqr": 0.0005399924702942371, "raw_times": [0.052101968321949244, 0.05358201451599598, 0.05286099622026086, 0.05301198689267039, 0.053400988690555096], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054280972108244896, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2699450124055147, "p50": 0.2703659702092409, "p90": 0.2711050328798592, "mean": 0.27191760018467903, "iqr": 0.0009190407581627369, "raw_times": [0.2703659702092409, 0.2711050328798592, 0.27798599330708385, 0.27018599212169647, 0.2699450124055147], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27769600274041295, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04519004141911864, "p50": 0.045621010940521955, "p90": 0.04610104952007532, "mean": 0.046770821791142225, "iqr": 0.0005300389602780342, "raw_times": [0.04557101055979729, 0.04610104952007532, 0.045621010940521955, 0.04519004141911864, 0.05137099651619792], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0482110190205276, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.05651102401316166, "p50": 0.05743099609389901, "p90": 0.05767098627984524, "mean": 0.05834901239722967, "iqr": 0.00033993273973464966, "raw_times": [0.05651102401316166, 0.06280100205913186, 0.05743099609389901, 0.05767098627984524, 0.05733105354011059], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05965103628113866, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2797759952954948, "p50": 0.2818260109052062, "p90": 0.2828260185196996, "mean": 0.28369000647217035, "iqr": 0.0011200318112969398, "raw_times": [0.2797759952954948, 0.28170598670840263, 0.2828260185196996, 0.2818260109052062, 0.29231602093204856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.278854975476861, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5820420337840915, "p50": 0.589212984777987, "p90": 0.5898119998164475, "mean": 0.5889245891012251, "iqr": 0.002659042365849018, "raw_times": [0.5898119998164475, 0.5871529574505985, 0.589212984777987, 0.5820420337840915, 0.5964029696770012], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5985530442558229, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.06898096762597561, "p50": 0.07057201582938433, "p90": 0.07107201963663101, "mean": 0.07093560416251421, "iqr": 0.0007100170478224754, "raw_times": [0.07369101513177156, 0.07036200258880854, 0.07107201963663101, 0.06898096762597561, 0.07057201582938433], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06987096276134253, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2780960057862103, "p50": 0.2848859876394272, "p90": 0.28691597981378436, "mean": 0.2848202013410628, "iqr": 0.003049965016543865, "raw_times": [0.2838660147972405, 0.28691597981378436, 0.2848859876394272, 0.2903370186686516, 0.2780960057862103], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.27952599339187145, "peak_bytes": 301998080, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5754730082117021, "p50": 0.5831019952893257, "p90": 0.5851630121469498, "mean": 0.582532596308738, "iqr": 0.004841014742851257, "raw_times": [0.5831019952893257, 0.5851630121469498, 0.5803219974040985, 0.5886029684916139, 0.5754730082117021], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5842720274813473, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1429850128479302, "p50": 1.1474639759398997, "p90": 1.1484349961392581, "mean": 1.1493865866214037, "iqr": 0.001491047441959381, "raw_times": [1.1474639759398997, 1.1429850128479302, 1.1484349961392581, 1.1469439486972988, 1.1611049994826317], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1680549941956997, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.03370095510035753, "p50": 0.03467098576948047, "p90": 0.03499101148918271, "mean": 0.034558994229882956, "iqr": 0.0006700283847749233, "raw_times": [0.03432098310440779, 0.03467098576948047, 0.03499101148918271, 0.03511103568598628, 0.03370095510035753], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.035281002055853605, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.04038098268210888, "p50": 0.04121096571907401, "p90": 0.041480991058051586, "mean": 0.04112699534744024, "iqr": 0.0006599584594368935, "raw_times": [0.04038098268210888, 0.04121096571907401, 0.04082103259861469, 0.041741004679352045, 0.041480991058051586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04577101208269596, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.0530310207977891, "p50": 0.05350098945200443, "p90": 0.05359097849577665, "mean": 0.053398997988551855, "iqr": 0.0005399924702942371, "raw_times": [0.05350098945200443, 0.053050986025482416, 0.0530310207977891, 0.05359097849577665, 0.053821015171706676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.062031031120568514, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.26927603175863624, "p50": 0.27200503973290324, "p90": 0.27223501820117235, "mean": 0.2713776077143848, "iqr": 0.002459040842950344, "raw_times": [0.27200503973290324, 0.27359597152099013, 0.26927603175863624, 0.27223501820117235, 0.269775977358222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2747260150499642, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.06719096563756466, "p50": 0.07005198858678341, "p90": 0.07023202488198876, "mean": 0.06946560461074114, "iqr": 0.0011699739843606949, "raw_times": [0.06719096563756466, 0.07005198858678341, 0.06906205089762807, 0.07079099304974079, 0.07023202488198876], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07167202420532703, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.2804459654726088, "p50": 0.28543599182739854, "p90": 0.28567598201334476, "mean": 0.2847379771992564, "iqr": 0.0013799872249364853, "raw_times": [0.28543599182739854, 0.2878359518945217, 0.2842959947884083, 0.28567598201334476, 0.2804459654726088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28216600185260177, "peak_bytes": 301998080, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5787130212411284, "p50": 0.5883330013602972, "p90": 0.5898720119148493, "mean": 0.5873608053661883, "iqr": 0.0030189985409379005, "raw_times": [0.5930329789407551, 0.5868530133739114, 0.5898720119148493, 0.5787130212411284, 0.5883330013602972], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5958119872957468, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1475840001367033, "p50": 1.149774994701147, "p90": 1.149774994701147, "mean": 1.149676798377186, "iqr": 0.0017299898900091648, "raw_times": [1.148045004811138, 1.149774994701147, 1.149774994701147, 1.153204997535795, 1.1475840001367033], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1586649925448, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.28807600028812885, "p50": 0.29028597055003047, "p90": 0.2923660213127732, "mean": 0.290472200140357, "iqr": 0.0033390242606401443, "raw_times": [0.2923660213127732, 0.29028597055003047, 0.2890269970521331, 0.28807600028812885, 0.29260601149871945], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28624600963667035, "peak_bytes": 335581184, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:01Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5808720015920699, "p50": 0.5865029525011778, "p90": 0.5889830063097179, "mean": 0.5862265941686928, "iqr": 0.0050209928303956985, "raw_times": [0.5808720015920699, 0.5865029525011778, 0.5908129969611764, 0.5889830063097179, 0.5839620134793222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5870030145160854, "peak_bytes": 603987968, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.1516850208863616, "p50": 1.1541039566509426, "p90": 1.159774954430759, "mean": 1.1568425805307925, "iqr": 0.006380956619977951, "raw_times": [1.1652549728751183, 1.153393997810781, 1.1541039566509426, 1.159774954430759, 1.1516850208863616], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1652849498204887, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.264778013341129, "p50": 2.2672080085612833, "p90": 2.2703579743392766, "mean": 2.2687464021146297, "iqr": 0.0047089415602386, "raw_times": [2.2703579743392766, 2.264778013341129, 2.2672080085612833, 2.265649032779038, 2.275738981552422], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.299049054272473, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00150299072265625, "mse": 1.0967254638671875e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5862729740329087, "p50": 0.5921830306760967, "p90": 0.5925330333411694, "mean": 0.591674807947129, "iqr": 0.00036100391298532486, "raw_times": [0.592172029428184, 0.5921830306760967, 0.5925330333411694, 0.5952129722572863, 0.5862729740329087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5872620386071503, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 1.150664989836514, "p50": 1.156534010078758, "p90": 1.1567150359041989, "mean": 1.15486680297181, "iqr": 0.004480069037526846, "raw_times": [1.1567150359041989, 1.150664989836514, 1.1581850121729076, 1.156534010078758, 1.152234966866672], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.150145020801574, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.00156402587890625, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 2.2770979558117688, "p50": 2.2999990032985806, "p90": 2.302108972799033, "mean": 2.2958547924645245, "iqr": 0.012759934179484844, "raw_times": [2.2770979558117688, 2.302108972799033, 2.310718991793692, 2.2999990032985806, 2.289349038619548], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.3001389927230775, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} -{"ts": "2025-10-24T19:24:02Z", "run": "fba0677bba464dcfbc87714fa9385ac6", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 4.500485956668854, "p50": 4.510977014433593, "p90": 4.513906955253333, "mean": 4.509088769555092, "iqr": 0.010930001735687256, "raw_times": [4.500485956668854, 4.510977014433593, 4.502976953517646, 4.513906955253333, 4.5170969679020345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.5062569552101195, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D1024", "batch": 1, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03865100006805733, "p50": 0.03903099991475756, "p90": 0.04018100003122527, "mean": 0.03959079995183856, "iqr": 0.001300000121773337, "raw_times": [0.03888099990945193, 0.03903099991475756, 0.04018100003122527, 0.04120999983570073, 0.03865100006805733], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05060100011178292, "peak_bytes": 2363392, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D2048", "batch": 1, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04517999991549004, "p50": 0.04712100007964182, "p90": 0.04805000003216264, "mean": 0.04695459997492435, "iqr": 0.001779000058377278, "raw_times": [0.04517999991549004, 0.04805000003216264, 0.04712100007964182, 0.046270999973785365, 0.04815099987354188], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05504099999598111, "peak_bytes": 4726784, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1324882507324219e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D4096", "batch": 1, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04581999996844388, "p50": 0.04766099982589367, "p90": 0.04786099998455029, "mean": 0.047156599976005964, "iqr": 0.0017899999420478707, "raw_times": [0.04766099982589367, 0.04786099998455029, 0.04837000005863956, 0.04607100004250242, 0.04581999996844388], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05361099988476781, "peak_bytes": 9453568, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00146484375, "mse": 1.049041748046875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D8192", "batch": 1, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045190999799160636, "p50": 0.04684100008489622, "p90": 0.04752099994220771, "mean": 0.046596999982284615, "iqr": 0.00227999998969608, "raw_times": [0.04524099995251163, 0.04752099994220771, 0.04819100013264688, 0.04684100008489622, 0.045190999799160636], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052801000038016355, "peak_bytes": 18907136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016326904296875, "mse": 1.1801719665527344e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04395000019030704, "p50": 0.045061000037094345, "p90": 0.046920999920985196, "mean": 0.04563460001918429, "iqr": 0.0018609998733154498, "raw_times": [0.04718099989986513, 0.046920999920985196, 0.045060000047669746, 0.045061000037094345, 0.04395000019030704], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05090100012239418, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656000010072603, "p50": 0.046920999920985196, "p90": 0.04878100003224972, "mean": 0.04884479999418545, "iqr": 0.0020300001324358163, "raw_times": [0.04656000010072603, 0.046750999899813905, 0.04878100003224972, 0.0552110000171524, 0.046920999920985196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0497399998948822, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04567099995256285, "p50": 0.04622100004780805, "p90": 0.04798100007974426, "mean": 0.047496800016233465, "iqr": 0.0018200000795332016, "raw_times": [0.04567099995256285, 0.0514500000008411, 0.04616100000021106, 0.04798100007974426, 0.04622100004780805], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04885000021204178, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04487000001063279, "p50": 0.045961000068928115, "p90": 0.046200000042517786, "mean": 0.04860060003011313, "iqr": 0.000509000074089272, "raw_times": [0.06028100006005843, 0.04487000001063279, 0.045690999968428514, 0.045961000068928115, 0.046200000042517786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05061100000602892, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043511000058060745, "p50": 0.046270999973785365, "p90": 0.04790999992110301, "mean": 0.047574600012012525, "iqr": 0.002919999815276242, "raw_times": [0.044990000105826766, 0.04790999992110301, 0.043511000058060745, 0.05519100000128674, 0.046270999973785365], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048970999841913, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043170000026293565, "p50": 0.04767099994751334, "p90": 0.0476899999739544, "mean": 0.04691639996963204, "iqr": 0.0009390000741404947, "raw_times": [0.043170000026293565, 0.04930000000058499, 0.04767099994751334, 0.046750999899813905, 0.0476899999739544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05084099984742352, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044720999994751764, "p50": 0.045860000000175205, "p90": 0.046411000084845, "mean": 0.04585680003401649, "iqr": 0.0012000000424450263, "raw_times": [0.044720999994751764, 0.04708100004791049, 0.046411000084845, 0.045860000000175205, 0.045211000042399974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05302099998516496, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04476999993130448, "p50": 0.04614999988916679, "p90": 0.04633100002138235, "mean": 0.04639259996110923, "iqr": 0.00019000003703695256, "raw_times": [0.04476999993130448, 0.04614999988916679, 0.04857099997934711, 0.0461409999843454, 0.04633100002138235], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047730999995110324, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04606099992088275, "p50": 0.046679999968546326, "p90": 0.04687099999500788, "mean": 0.0466285999664251, "iqr": 0.0006509999366244301, "raw_times": [0.04606099992088275, 0.047310999889305094, 0.04687099999500788, 0.046679999968546326, 0.04622000005838345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.050389999842082034, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04560100001071987, "p50": 0.04617999979927845, "p90": 0.04656999999497202, "mean": 0.0462445999346528, "iqr": 0.0007090000053722179, "raw_times": [0.04617999979927845, 0.045860999989599804, 0.04560100001071987, 0.04701099987869384, 0.04656999999497202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049061000026995316, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04474100001061743, "p50": 0.04615000011654047, "p90": 0.04696099995271652, "mean": 0.046176800060493406, "iqr": 0.0009599998520570807, "raw_times": [0.047031000121933175, 0.04474100001061743, 0.04600100010065944, 0.04615000011654047, 0.04696099995271652], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051490000032572425, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05155100006959401, "p90": 0.05226099983701715, "mean": 0.051880799992432, "iqr": 0.0007709998044447275, "raw_times": [0.051341000016691396, 0.051490000032572425, 0.05226099983701715, 0.05155100006959401, 0.05276100000628503], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053531000048678834, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044059999936507666, "p50": 0.04549100003714557, "p90": 0.045540999963122886, "mean": 0.04540859999906388, "iqr": 0.0004099999841855606, "raw_times": [0.04549100003714557, 0.044059999936507666, 0.04682000007960596, 0.045130999978937325, 0.045540999963122886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048860999868338695, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04604099990501709, "p50": 0.04642099997909099, "p90": 0.04698099996858218, "mean": 0.05290099998092046, "iqr": 0.0009299999419454252, "raw_times": [0.07901100002527528, 0.04698099996858218, 0.04604099990501709, 0.04605100002663676, 0.04642099997909099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048481000021638465, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04422100005285756, "p50": 0.045961000068928115, "p90": 0.04607100004250242, "mean": 0.04557280003609776, "iqr": 0.0010700000530050602, "raw_times": [0.04500099998949736, 0.045961000068928115, 0.04422100005285756, 0.046610000026703347, 0.04607100004250242], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05000100009056041, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044550999973580474, "p50": 0.04615100010596507, "p90": 0.04661999992094934, "mean": 0.04619880000973353, "iqr": 0.0006089999260439072, "raw_times": [0.04661999992094934, 0.047661000053267344, 0.04615100010596507, 0.04601099999490543, 0.044550999973580474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05021999982091074, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04479100016396842, "p50": 0.04570999999486958, "p90": 0.04578100015351083, "mean": 0.04546060008578934, "iqr": 0.0006410000423784368, "raw_times": [0.045881000005465467, 0.04578100015351083, 0.045140000111132395, 0.04479100016396842, 0.04570999999486958], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05074099999546888, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04388000002109038, "p50": 0.046260999852165696, "p90": 0.047070999926290824, "mean": 0.046070799999142764, "iqr": 0.0010899998414970469, "raw_times": [0.04716100011137314, 0.04598100008479378, 0.04388000002109038, 0.046260999852165696, 0.047070999926290824], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05007100003240339, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04435100004229753, "p50": 0.045130999978937325, "p90": 0.04698099996858218, "mean": 0.04562479998639901, "iqr": 0.0023600000531587284, "raw_times": [0.044620999915423454, 0.04698099996858218, 0.04704000002675457, 0.04435100004229753, 0.045130999978937325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04849099991588446, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05119000002196117, "p50": 0.05123000005369249, "p90": 0.05150099991624302, "mean": 0.051574400004028575, "iqr": 0.00027999999474559445, "raw_times": [0.05122099992149742, 0.05150099991624302, 0.05123000005369249, 0.052730000106748776, 0.05119000002196117], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05633099999613478, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04580100016937649, "p50": 0.04708999995273189, "p90": 0.04770099985762499, "mean": 0.05188039999666216, "iqr": 0.00096099984148168, "raw_times": [0.07206999998743413, 0.04674000001614331, 0.04580100016937649, 0.04708999995273189, 0.04770099985762499], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04944000011164462, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04320099992582982, "p50": 0.04512100008469133, "p90": 0.04604099990501709, "mean": 0.04527500000222062, "iqr": 0.001329999804511317, "raw_times": [0.04320099992582982, 0.04604099990501709, 0.04471100010050577, 0.0473009999950591, 0.04512100008469133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051181000117139774, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984999986845651, "p50": 0.050290999979552, "p90": 0.050490999910834944, "mean": 0.050288599959458224, "iqr": 0.0005399999736255268, "raw_times": [0.04995099993720942, 0.050490999910834944, 0.050290999979552, 0.050860000101238256, 0.04984999986845651], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052241000048525166, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2064129998871067, "p50": 0.2123330000358692, "p90": 0.218262999851504, "mean": 0.2148927999769512, "iqr": 0.010130999726243317, "raw_times": [0.20813200012526067, 0.218262999851504, 0.2123330000358692, 0.2064129998871067, 0.22932299998501549], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21481299995684822, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04353000008450181, "p50": 0.04543000000012398, "p90": 0.04657099998439662, "mean": 0.04557060001388891, "iqr": 0.001390000079481979, "raw_times": [0.04543000000012398, 0.04518099990491464, 0.04714100009550748, 0.04657099998439662, 0.04353000008450181], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0490809998154873, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054420999958892935, "p50": 0.05506100001184677, "p90": 0.055460999874412664, "mean": 0.055042999929355574, "iqr": 0.0008699998943484388, "raw_times": [0.054420999958892935, 0.055460999874412664, 0.05568099982156127, 0.054590999980064225, 0.05506100001184677], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05802099985885434, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20996300008846447, "p50": 0.2102230000673444, "p90": 0.21053299997220165, "mean": 0.21050080003988114, "iqr": 0.0004209998678561533, "raw_times": [0.20996300008846447, 0.21053299997220165, 0.2102230000673444, 0.2101120001043455, 0.2116729999670497], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21157300011509506, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4341660001045966, "p50": 0.4372359999251785, "p90": 0.4383160000998032, "mean": 0.437980000015159, "iqr": 0.004120000085094944, "raw_times": [0.4383160000998032, 0.4372359999251785, 0.4341660001045966, 0.43419600001470826, 0.44598599993150856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44448700009525055, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04514099987318332, "p50": 0.0465299999632407, "p90": 0.04655099996853096, "mean": 0.04629059999388119, "iqr": 0.0011309998626529705, "raw_times": [0.04514099987318332, 0.04655099996853096, 0.04781100005857297, 0.04542000010587799, 0.0465299999632407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04811100006918423, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043820999962917995, "p50": 0.045551000084742554, "p90": 0.04633000003195775, "mean": 0.04580079998959263, "iqr": 0.0007890000688348664, "raw_times": [0.043820999962917995, 0.04776099990522198, 0.045551000084742554, 0.045540999963122886, 0.04633000003195775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05054000007476134, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04511099996307166, "p50": 0.04610100017998775, "p90": 0.04624100006367371, "mean": 0.04598500008796691, "iqr": 0.0004099999841855606, "raw_times": [0.04583100007948815, 0.04664100015361328, 0.04511099996307166, 0.04610100017998775, 0.04624100006367371], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04932000001645065, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05165100014892232, "p50": 0.052130999847577186, "p90": 0.05317099999047059, "mean": 0.05250480003269331, "iqr": 0.0012309999419812812, "raw_times": [0.052130999847577186, 0.05165100014892232, 0.053631000128007145, 0.05317099999047059, 0.05194000004848931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055000999964249786, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045381000063571264, "p50": 0.045759999920846894, "p90": 0.04781100005857297, "mean": 0.04770240002471837, "iqr": 0.00238100005844899, "raw_times": [0.045759999920846894, 0.04781100005857297, 0.045381000063571264, 0.04543000000012398, 0.05413000008047675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04919000002701068, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05421099990599032, "p50": 0.054861000080563826, "p90": 0.05564100001720362, "mean": 0.05508300000656163, "iqr": 0.0010100000054080738, "raw_times": [0.056071000017254846, 0.054861000080563826, 0.05564100001720362, 0.05421099990599032, 0.05463100001179555], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05805000000691507, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20916299990858533, "p50": 0.21016300001974741, "p90": 0.21141399997759436, "mean": 0.21107719999235997, "iqr": 0.0015210000583465444, "raw_times": [0.21141399997759436, 0.2147530001366249, 0.2098929999192478, 0.21016300001974741, 0.20916299990858533], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21191299993006396, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43155599996680394, "p50": 0.43475600000419945, "p90": 0.4373360000045068, "mean": 0.43558200000006764, "iqr": 0.003800000058618025, "raw_times": [0.43475600000419945, 0.44072600007893925, 0.4373360000045068, 0.43353599994588876, 0.43155599996680394], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44892699997944874, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0483500000427739, "p50": 0.049099999841928366, "p90": 0.04950099992129253, "mean": 0.050544599935165024, "iqr": 0.0011199999789823778, "raw_times": [0.048380999942310154, 0.04950099992129253, 0.05739099992752017, 0.049099999841928366, 0.0483500000427739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05153099982635467, "peak_bytes": 335581184, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2181429999836837, "p50": 0.2215729998624738, "p90": 0.2217329999893991, "mean": 0.22086119997766218, "iqr": 0.003440000000409782, "raw_times": [0.2181429999836837, 0.2217329999893991, 0.21829299998898932, 0.2215729998624738, 0.22456400006376498], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22583300005862839, "peak_bytes": 603987968, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43596600016826415, "p50": 0.4398270000365301, "p90": 0.4409260000102222, "mean": 0.4390922000766295, "iqr": 0.003549999973984086, "raw_times": [0.4398270000365301, 0.4409260000102222, 0.4413660001318931, 0.43596600016826415, 0.4373760000362381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44040700004188693, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8329219999723136, "p50": 0.8419220000632777, "p90": 0.8434520000264456, "mean": 0.84072780000497, "iqr": 0.002130000211764127, "raw_times": [0.8329219999723136, 0.8419220000632777, 0.8440210001481319, 0.8434520000264456, 0.8413219998146815], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8442119999472197, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21230300012575753, "p50": 0.2135429999725602, "p90": 0.2142630000889767, "mean": 0.21426700000120036, "iqr": 0.0008800002433417831, "raw_times": [0.21230300012575753, 0.2133829998456349, 0.2135429999725602, 0.2142630000889767, 0.21784299997307244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22175300000526477, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4536460000963416, "p50": 0.45670700001210207, "p90": 0.4569770001126017, "mean": 0.45669080004699936, "iqr": 0.00113999999484804, "raw_times": [0.4536460000963416, 0.4569770001126017, 0.45583700011775363, 0.45670700001210207, 0.4602869998961978], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4546860000118613, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8352710001418018, "p50": 0.8370320001631626, "p90": 0.8388319999994565, "mean": 0.8375798000997747, "iqr": 0.0019899998733308166, "raw_times": [0.8352710001418018, 0.8368420001261256, 0.8399220000683272, 0.8370320001631626, 0.8388319999994565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.849921000053655, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null} +{"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6451530000267667, "p50": 1.6546740000649152, "p90": 1.6553830000702874, "mean": 1.6516054000476288, "iqr": 0.008870000101524056, "raw_times": [1.6553830000702874, 1.6465129999687633, 1.6563040001074114, 1.6546740000649152, 1.6451530000267667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.655194000022675, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}