| {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04274000002624234, "p50": 0.043191000031583826, "p90": 0.04467100006877445, "mean": 0.04373860001578578, "iqr": 0.0017300001218245598, "raw_times": [0.04467100006877445, 0.04515000000537839, 0.043191000031583826, 0.04274000002624234, 0.04294099994694989], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04910000006930204, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048549999974056846, "p50": 0.049830999842015444, "p90": 0.05033100001128332, "mean": 0.04977279995728168, "iqr": 0.0006400000529538374, "raw_times": [0.048549999974056846, 0.049690999958329485, 0.05033100001128332, 0.05046100000072329, 0.049830999842015444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05312100006449327, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049481000132800546, "p50": 0.04955999997946492, "p90": 0.04985100008525478, "mean": 0.049792600020737154, "iqr": 0.000360000058208243, "raw_times": [0.04955999997946492, 0.050579999879118986, 0.049481000132800546, 0.04985100008525478, 0.04949100002704654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052620999895225395, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04747100001623039, "p50": 0.049561000196263194, "p90": 0.04995100016458309, "mean": 0.04936700006510364, "iqr": 0.0008900001375877764, "raw_times": [0.04747100001623039, 0.049561000196263194, 0.0507909999214462, 0.049061000026995316, 0.04995100016458309], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0509510000483715, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:36Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04721999994217185, "p50": 0.04802100011147559, "p90": 0.048511000159123796, "mean": 0.0482608000311302, "iqr": 0.0008600002274761209, "raw_times": [0.04802100011147559, 0.04721999994217185, 0.048511000159123796, 0.0499010000112321, 0.047650999931647675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051911000127802254, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04553000007945229, "p50": 0.047661000053267344, "p90": 0.04845100011152681, "mean": 0.049852800020744326, "iqr": 0.0010610001481836662, "raw_times": [0.04553000007945229, 0.04738999996334314, 0.047661000053267344, 0.04845100011152681, 0.06023199989613204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04891099979431601, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04606099992088275, "p50": 0.04722100015897013, "p90": 0.047730999995110324, "mean": 0.04745279998132901, "iqr": 0.0006210000265127746, "raw_times": [0.04606099992088275, 0.04914099986308429, 0.04722100015897013, 0.047730999995110324, 0.04710999996859755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05060099988440925, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047480999910476385, "p50": 0.04807099981007923, "p90": 0.04905100013274932, "mean": 0.049742999999580206, "iqr": 0.0014700001429446274, "raw_times": [0.047480999910476385, 0.047580999989804695, 0.04905100013274932, 0.0565310001547914, 0.04807099981007923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04896100017504068, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:37Z", "run": "b81f5729b90144f29ef4b2b3f014bb6b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046829999973851955, "p50": 0.04784099996868463, "p90": 0.0479610000638786, "mean": 0.047636799990868894, "iqr": 0.001030000021273736, "raw_times": [0.046829999973851955, 0.048620999905324425, 0.046931000042604865, 0.0479610000638786, 0.04784099996868463], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05104100000608014, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} | |