{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.039330999982212234, "p50": 0.04005099998494188, "p90": 0.04157099999702041, "mean": 0.040440999998736515, "iqr": 0.0020999999605919584, "raw_times": [0.03947100003642845, 0.04157099999702041, 0.04005099998494188, 0.041780999993079604, 0.039330999982212234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047832000007019815, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0515919999770631, "p50": 0.05179099997576486, "p90": 0.05224099999168175, "mean": 0.05211119997738933, "iqr": 0.0006300000450210064, "raw_times": [0.05224099999168175, 0.0515919999770631, 0.05161099994666074, 0.05179099997576486, 0.05332099999577622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0555309999867859, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04996100000198567, "p50": 0.05194099998107049, "p90": 0.05195099998900332, "mean": 0.05124099998283782, "iqr": 0.0016000000186977559, "raw_times": [0.05194099998107049, 0.05195099998900332, 0.04996100000198567, 0.05200099997182406, 0.050350999970305566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05537100003039086, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04920100002436811, "p50": 0.05169100001012339, "p90": 0.05200099997182406, "mean": 0.051318999987870484, "iqr": 0.000339999985499162, "raw_times": [0.051660999986324896, 0.05204099994671196, 0.04920100002436811, 0.05200099997182406, 0.05169100001012339], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055880999980217894, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04919000002701068, "p50": 0.05105200000343757, "p90": 0.05142099996646721, "mean": 0.050994999992326484, "iqr": 0.0005200000146032835, "raw_times": [0.05090099995186392, 0.05241100001285304, 0.05142099996646721, 0.04919000002701068, 0.05105200000343757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054681999984040885, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04663100003199361, "p50": 0.05066099998884965, "p90": 0.05077099996242396, "mean": 0.049591000004056696, "iqr": 0.0016599999526079046, "raw_times": [0.04911100000981605, 0.05077099996242396, 0.04663100003199361, 0.05078100002720021, 0.05066099998884965], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05457200001046658, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04728099997919344, "p50": 0.050772000008691975, "p90": 0.051271000018005, "mean": 0.04967720000195186, "iqr": 0.003820000017640268, "raw_times": [0.04728099997919344, 0.051271000018005, 0.05161100000350416, 0.050772000008691975, 0.04745100000036473], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05381099998658101, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04900099997939833, "p50": 0.04957199996624695, "p90": 0.05115100003649786, "mean": 0.05033119999779956, "iqr": 0.001620000034563418, "raw_times": [0.04900099997939833, 0.052401000004920206, 0.04957199996624695, 0.05115100003649786, 0.049531000001934444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05343100002619394, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04688100000294071, "p50": 0.04992099997025434, "p90": 0.05054100000734252, "mean": 0.049500799991619715, "iqr": 0.0023510000346504967, "raw_times": [0.04688100000294071, 0.04992099997025434, 0.05054100000734252, 0.04818999997269202, 0.051971000004868984], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05505100000391394, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}