diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl index 01ad9024b2315e31ab7b238c36c56d81058547ae..5005b51b61d0820b9357a26624cb12042367144c 100644 --- a/activation/impls/artifacts/benchmark/activation.jsonl +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -1,9 +1,9 @@ -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04233100003148138, "p50": 0.043751000021075015, "p90": 0.044161999994685175, "mean": 0.04361539999990782, "iqr": 0.001740000016070553, "raw_times": [0.044161999994685175, 0.04541099997368292, 0.04242199997861462, 0.043751000021075015, 0.04233100003148138], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05063100002189458, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054010999974707374, "p50": 0.05540200004361395, "p90": 0.05709199990633351, "mean": 0.057631800018498325, "iqr": 0.0019599997358454857, "raw_times": [0.054010999974707374, 0.05513200017048803, 0.06652199999734876, 0.05540200004361395, 0.05709199990633351], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05926099993303069, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05346099987946218, "p50": 0.054341999884854886, "p90": 0.05543199995372561, "mean": 0.054953799917711876, "iqr": 0.001390000079481979, "raw_times": [0.05346099987946218, 0.05543199995372561, 0.05749199999627308, 0.054341999884854886, 0.05404199987424363], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05924099991716503, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053281999953469494, "p50": 0.054581999847869156, "p90": 0.05551200001718826, "mean": 0.054651799973726156, "iqr": 0.0014510001165035646, "raw_times": [0.05406099990068469, 0.05582200014941918, 0.05551200001718826, 0.054581999847869156, 0.053281999953469494], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05814099995404831, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05309099992700794, "p50": 0.05449100012810959, "p90": 0.05478200000652578, "mean": 0.05435540001599293, "iqr": 0.0010310000106983352, "raw_times": [0.05449100012810959, 0.055662000022493885, 0.05375099999582744, 0.05309099992700794, 0.05478200000652578], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057451999964541756, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051550999842220335, "p50": 0.052460999995673774, "p90": 0.05307099991114228, "mean": 0.05247719996077649, "iqr": 0.000889999910214101, "raw_times": [0.051550999842220335, 0.05307099991114228, 0.05218100000092818, 0.052460999995673774, 0.05312200005391787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07207299995570793, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052890999995725, "p50": 0.05325200004335784, "p90": 0.054772000112279784, "mean": 0.053839400061406195, "iqr": 0.001821000068957801, "raw_times": [0.05295100004332198, 0.054772000112279784, 0.05325200004335784, 0.052890999995725, 0.055331000112346373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05688200008080457, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05233100000623381, "p50": 0.054522000027645845, "p90": 0.05475100010698952, "mean": 0.05385140002545086, "iqr": 0.0020300001324358163, "raw_times": [0.052720999974553706, 0.05475100010698952, 0.05233100000623381, 0.054932000011831406, 0.054522000027645845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056971000049088616, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052550999953382416, "p50": 0.05365099991649913, "p90": 0.053941000032864395, "mean": 0.053534999960902496, "iqr": 0.0006200000370881753, "raw_times": [0.05421099990599032, 0.05365099991649913, 0.052550999953382416, 0.053941000032864395, 0.05332099999577622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058042000091518275, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.13450000000148066, "p50": 0.1411669999811238, "p90": 0.1532919999931437, "mean": 0.1477000000022599, "iqr": 0.017083999978240172, "raw_times": [0.13620800001490352, 0.1733330000206479, 0.1532919999931437, 0.1411669999811238, 0.13450000000148066], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.1447500000040236, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008915023063309491, "mse": 4.496400833886582e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.1742909999506992, "p50": 0.17550000001165245, "p90": 0.17633400000249821, "mean": 0.17563320000135718, "iqr": 0.001000999986899842, "raw_times": [0.1742909999506992, 0.17633400000249821, 0.17533300001559837, 0.17670800002633769, 0.17550000001165245], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.731916999995974, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008884685230441391, "mse": 4.475335117604118e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.35966699999789853, "p50": 0.3839590000325188, "p90": 0.4197920000024169, "mean": 0.3930668000066362, "iqr": 0.05745900000420079, "raw_times": [0.35966699999789853, 0.3623329999982161, 0.4395830000021306, 0.4197920000024169, 0.3839590000325188], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.37070900003755014, "peak_bytes": null, "ok": false, "absmax": 0.07091712951660156, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.07091712951660156, "mae": 0.0008893357589840889, "mse": 4.469751274882583e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.27337500000612636, "p50": 0.325791999955527, "p90": 0.3564579999988382, "mean": 0.5360415999916768, "iqr": 0.03887500002974775, "raw_times": [1.4070000000288019, 0.3564579999988382, 0.325791999955527, 0.27337500000612636, 0.31758299996909045], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.2649170000145205, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008873133920133114, "mse": 4.3958548303635325e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.3514999999651991, "p50": 0.39737500003411697, "p90": 0.42058299999325754, "mean": 0.44304979999196803, "iqr": 0.05525000000261571, "raw_times": [0.42058299999325754, 0.39737500003411697, 0.6804579999766247, 0.3514999999651991, 0.36533299999064184], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.976333000015984, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008889895398169756, "mse": 4.431089109857567e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.9706249999794636, "p50": 0.9802499999977954, "p90": 3.842000000020107, "mean": 2.413258199999291, "iqr": 2.863209000054212, "raw_times": [3.842000000020107, 5.294625000033193, 0.9802499999977954, 0.978790999965895, 0.9706249999794636], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.3860840000224925, "peak_bytes": null, "ok": false, "absmax": 0.08395957946777344, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.08395957946777344, "mae": 0.0008889408782124519, "mse": 4.476671620068373e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.6639999999720203, "p50": 0.8687079999845082, "p90": 1.1298749999468782, "mean": 0.9603583999933107, "iqr": 0.2749159999098083, "raw_times": [0.8549590000370699, 1.284250000026077, 1.1298749999468782, 0.6639999999720203, 0.8687079999845082], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.7134589999964192, "peak_bytes": null, "ok": false, "absmax": 0.05687236785888672, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.05687236785888672, "mae": 0.0008884922135621309, "mse": 4.399109002406476e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 1.141958000005161, "p50": 1.6311670000277445, "p90": 1.6544580000186215, "mean": 1.7749248000086482, "iqr": 0.366167000038331, "raw_times": [1.6544580000186215, 1.2882909999802905, 3.1587500000114233, 1.6311670000277445, 1.141958000005161], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.0730410000169286, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008890957687981427, "mse": 4.448749677976593e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:49Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 2.664708999986942, "p50": 3.365374999987125, "p90": 3.6645420000240847, "mean": 3.5541085999966526, "iqr": 0.8831670000404301, "raw_times": [2.664708999986942, 3.6645420000240847, 3.365374999987125, 5.2945420000014565, 2.7813749999836546], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 17.938291999996636, "peak_bytes": null, "ok": false, "absmax": 0.09098148345947266, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.09098148345947266, "mae": 0.0008892239420674741, "mse": 4.500504473980982e-06, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py index 711af9e01652ef5081b507affd0f7df9ac99e644..fd785e205d9fbcbf9a01065929fe8402a83fcf03 100644 --- a/activation/impls/cells/benchmark.py +++ b/activation/impls/cells/benchmark.py @@ -22,7 +22,7 @@ def swiglu_eager(x): run_benchmark( kernel_type=KernelTypeEnum.ACTIVATION, - impl_name="torch_eager", - impl_tags={"family":"hf-kernels", "backend":"eager"}, + impl_name="torch_eager_darwin", + impl_tags={"family":"pytorch", "backend":"eager", "platform": "darwin"}, impl_func=swiglu_eager, ) \ No newline at end of file diff --git a/activation/impls/cells/sysinfo.py b/activation/impls/cells/sysinfo.py new file mode 100644 index 0000000000000000000000000000000000000000..b2c8722bdf08e8bd2c4f4a673833bd0e5db39fc0 --- /dev/null +++ b/activation/impls/cells/sysinfo.py @@ -0,0 +1,14 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "torch==2.8.0", +# ] +# /// +import platform +import subprocess +print(f"Platform: {platform.system()} {platform.machine()}") +print(f"Python: {platform.python_version()}") +# Check for MPS availability +import torch +print(f"PyTorch: {torch.__version__}") +print(f"MPS available: {torch.backends.mps.is_available()}") \ No newline at end of file diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html index f8d816a75a59b5a7e634d3e627b3d6b3842f78fa..27593f80515fb851d5c18e04f950f201441aca78 100644 --- a/activation/impls/hf_kernels_swiglu.html +++ b/activation/impls/hf_kernels_swiglu.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.29s +Cell: nv | 0.25s | Raw @@ -3905,7 +3905,7 @@ Cell: nv | 0.29s
Fri Dec 19 19:54:13 2025 +Fri Dec 19 23:01:11 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -3914,7 +3914,7 @@ Cell: nv | 0.29s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 35C P0 120W / 350W | 0MiB / 46068MiB | 100% Default | +| N/A 39C P0 82W / 350W | 0MiB / 46068MiB | 10% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3938,7 +3938,7 @@ Cell: nv | 0.29s ▼ output ▶ uv-logs | -Cell: benchmark | 8.35s +Cell: benchmark | 8.49s | Raw @@ -3995,16 +3995,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 85.600us 2073.64% 85.600us 85.600us 1 - hf_kernels_swiglu 8.76% 183.666us 99.29% 2.081ms 2.081ms 0.000us 0.00% 5.568us 5.568us 1 - _activation_23bf3fb::silu_and_mul 0.98% 20.570us 88.50% 1.855ms 618.341us 4.128us 100.00% 5.568us 1.856us 3 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 76.129us 1844.21% 76.129us 76.129us 1 + hf_kernels_swiglu 8.60% 174.603us 99.27% 2.015ms 2.015ms 0.000us 0.00% 5.568us 5.568us 1 + _activation_23bf3fb::silu_and_mul 0.97% 19.670us 88.54% 1.797ms 599.020us 4.128us 100.00% 5.568us 1.856us 3 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3 - Activity Buffer Request 85.39% 1.790ms 85.39% 1.790ms 1.790ms 1.440us 34.88% 1.440us 1.440us 1 - aten::empty 2.03% 42.471us 2.03% 42.471us 14.157us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.13% 44.611us 2.13% 44.611us 14.870us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.71% 14.820us 0.71% 14.820us 14.820us 0.000us 0.00% 0.000us 0.000us 1 + Activity Buffer Request 85.37% 1.733ms 85.37% 1.733ms 1.733ms 1.440us 34.88% 1.440us 1.440us 1 + aten::empty 2.13% 43.191us 2.13% 43.191us 14.397us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.20% 44.752us 2.20% 44.752us 14.917us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.73% 14.741us 0.73% 14.741us 14.741us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.096ms +Self CPU time total: 2.030ms Self CUDA time total: 4.128us @@ -4015,17 +4015,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.111us 1666.52% 66.111us 66.111us 1 - hf_kernels_swiglu 4.94% 94.004us 99.69% 1.897ms 1.897ms 0.000us 0.00% 5.311us 5.311us 1 - _activation_23bf3fb::silu_and_mul 0.99% 18.841us 93.73% 1.783ms 594.417us 3.967us 100.00% 5.311us 1.770us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3 - Activity Buffer Request 91.36% 1.738ms 91.36% 1.738ms 1.738ms 1.344us 33.88% 1.344us 1.344us 1 - aten::empty 1.01% 19.260us 1.01% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.38% 26.230us 1.38% 26.230us 8.743us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.31% 5.950us 0.31% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.783us 1582.23% 62.783us 62.783us 1 + hf_kernels_swiglu 4.95% 92.601us 99.70% 1.863ms 1.863ms 0.000us 0.00% 5.312us 5.312us 1 + _activation_23bf3fb::silu_and_mul 1.25% 23.392us 93.77% 1.753ms 584.220us 3.968us 100.00% 5.312us 1.771us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3 + Activity Buffer Request 91.17% 1.704ms 91.17% 1.704ms 1.704ms 1.344us 33.87% 1.344us 1.344us 1 + aten::empty 0.97% 18.160us 0.97% 18.160us 6.053us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.35% 25.221us 1.35% 25.221us 8.407us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.30% 5.620us 0.30% 5.620us 5.620us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.902ms -Self CUDA time total: 3.967us +Self CPU time total: 1.869ms +Self CUDA time total: 3.968us @@ -4035,17 +4035,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.479us 1380.35% 68.479us 68.479us 1 - hf_kernels_swiglu 4.69% 88.684us 99.71% 1.886ms 1.886ms 0.000us 0.00% 6.625us 6.625us 1 - _activation_23bf3fb::silu_and_mul 0.99% 18.661us 94.04% 1.778ms 592.827us 4.961us 100.00% 6.625us 2.208us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.961us 100.00% 4.961us 1.654us 3 - Activity Buffer Request 91.53% 1.731ms 91.53% 1.731ms 1.731ms 1.664us 33.54% 1.664us 1.664us 1 - aten::empty 0.98% 18.610us 0.98% 18.610us 6.203us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.52% 28.800us 1.52% 28.800us 9.600us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.29% 5.500us 0.29% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.887us 1264.03% 61.887us 61.887us 1 + hf_kernels_swiglu 4.90% 91.392us 99.70% 1.861ms 1.861ms 0.000us 0.00% 6.528us 6.528us 1 + _activation_23bf3fb::silu_and_mul 1.06% 19.772us 93.81% 1.751ms 583.690us 4.896us 100.00% 6.528us 2.176us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3 + Activity Buffer Request 91.42% 1.706ms 91.42% 1.706ms 1.706ms 1.632us 33.33% 1.632us 1.632us 1 + aten::empty 1.00% 18.580us 1.00% 18.580us 6.193us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.33% 24.870us 1.33% 24.870us 8.290us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.30% 5.640us 0.30% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.891ms -Self CUDA time total: 4.961us +Self CPU time total: 1.867ms +Self CUDA time total: 4.896us @@ -4055,17 +4055,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.368us 1547.76% 66.368us 66.368us 1 - hf_kernels_swiglu 4.25% 87.402us 99.76% 2.051ms 2.051ms 0.000us 0.00% 5.760us 5.760us 1 - _activation_23bf3fb::silu_and_mul 0.97% 19.981us 94.58% 1.945ms 648.228us 4.288us 100.00% 5.760us 1.920us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3 - Activity Buffer Request 83.83% 1.724ms 83.83% 1.724ms 1.724ms 1.472us 34.33% 1.472us 1.472us 1 - aten::empty 0.93% 19.111us 0.93% 19.111us 6.370us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 9.77% 200.885us 9.77% 200.885us 66.962us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.24% 5.020us 0.24% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.431us 1560.88% 66.431us 66.431us 1 + hf_kernels_swiglu 4.62% 96.552us 99.72% 2.084ms 2.084ms 0.000us 0.00% 5.696us 5.696us 1 + _activation_23bf3fb::silu_and_mul 0.92% 19.230us 94.20% 1.969ms 656.267us 4.256us 100.00% 5.696us 1.899us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3 + Activity Buffer Request 82.63% 1.727ms 82.63% 1.727ms 1.727ms 1.440us 33.83% 1.440us 1.440us 1 + aten::empty 0.91% 18.961us 0.91% 18.961us 6.320us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 10.64% 222.454us 10.64% 222.454us 74.151us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.28% 5.800us 0.28% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.056ms -Self CUDA time total: 4.288us +Self CPU time total: 2.090ms +Self CUDA time total: 4.256us @@ -4075,17 +4075,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.360us 1131.72% 67.360us 67.360us 1 - hf_kernels_swiglu 4.31% 89.293us 99.77% 2.067ms 2.067ms 0.000us 0.00% 7.968us 7.968us 1 - _activation_23bf3fb::silu_and_mul 0.98% 20.220us 94.55% 1.959ms 652.859us 5.952us 100.00% 7.968us 2.656us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 100.00% 5.952us 1.984us 3 - Activity Buffer Request 85.78% 1.777ms 85.78% 1.777ms 1.777ms 2.016us 33.87% 2.016us 2.016us 1 - aten::empty 0.91% 18.861us 0.91% 18.861us 6.287us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 7.79% 161.464us 7.79% 161.464us 53.821us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.23% 4.820us 0.23% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.753us 1065.60% 62.753us 62.753us 1 + hf_kernels_swiglu 4.32% 90.233us 99.73% 2.084ms 2.084ms 0.000us 0.00% 7.842us 7.842us 1 + _activation_23bf3fb::silu_and_mul 0.98% 20.530us 94.51% 1.975ms 658.421us 5.889us 100.00% 7.842us 2.614us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3 + Activity Buffer Request 83.43% 1.744ms 83.43% 1.744ms 1.744ms 1.953us 33.16% 1.953us 1.953us 1 + aten::empty 0.90% 18.820us 0.90% 18.820us 6.273us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 10.09% 210.974us 10.09% 210.974us 70.325us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 5.680us 0.27% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.072ms -Self CUDA time total: 5.952us +Self CPU time total: 2.090ms +Self CUDA time total: 5.889us @@ -4095,17 +4095,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.574us 830.43% 64.574us 64.574us 1 - hf_kernels_swiglu 18.42% 86.111us 98.86% 462.073us 462.073us 0.000us 0.00% 10.367us 10.367us 1 - _activation_23bf3fb::silu_and_mul 4.27% 19.980us 76.48% 357.451us 119.150us 7.776us 100.00% 10.367us 3.456us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 100.00% 7.776us 2.592us 3 - Activity Buffer Request 38.90% 181.805us 38.90% 181.805us 181.805us 2.591us 33.32% 2.591us 2.591us 1 - aten::empty 3.96% 18.511us 3.96% 18.511us 6.170us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 33.30% 155.666us 33.30% 155.666us 51.889us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.14% 5.330us 1.14% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.974us 761.74% 58.974us 58.974us 1 + hf_kernels_swiglu 14.39% 83.563us 99.11% 575.543us 575.543us 0.000us 0.00% 10.333us 10.333us 1 + _activation_23bf3fb::silu_and_mul 3.37% 19.590us 81.67% 474.270us 158.090us 7.742us 100.00% 10.333us 3.444us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.742us 100.00% 7.742us 2.581us 3 + Activity Buffer Request 43.30% 251.476us 43.30% 251.476us 251.476us 2.591us 33.47% 2.591us 2.591us 1 + aten::empty 3.05% 17.710us 3.05% 17.710us 5.903us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.99% 203.204us 34.99% 203.204us 67.735us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.89% 5.190us 0.89% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 467.403us -Self CUDA time total: 7.776us +Self CPU time total: 580.733us +Self CUDA time total: 7.742us @@ -4115,16 +4115,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.527us 943.95% 62.527us 62.527us 1 - hf_kernels_swiglu 18.86% 83.092us 98.85% 435.523us 435.523us 0.000us 0.00% 8.832us 8.832us 1 - _activation_23bf3fb::silu_and_mul 4.63% 20.380us 75.83% 334.080us 111.360us 6.624us 100.00% 8.832us 2.944us 3 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.191us 908.68% 60.191us 60.191us 1 + hf_kernels_swiglu 14.49% 83.902us 99.19% 574.293us 574.293us 0.000us 0.00% 8.832us 8.832us 1 + _activation_23bf3fb::silu_and_mul 3.38% 19.561us 81.54% 472.101us 157.367us 6.624us 100.00% 8.832us 2.944us 3 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3 - Activity Buffer Request 36.44% 160.555us 36.44% 160.555us 160.555us 2.208us 33.33% 2.208us 2.208us 1 - aten::empty 4.17% 18.351us 4.17% 18.351us 6.117us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.76% 153.145us 34.76% 153.145us 51.048us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.15% 5.060us 1.15% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 + Activity Buffer Request 43.39% 251.205us 43.39% 251.205us 251.205us 2.208us 33.33% 2.208us 2.208us 1 + aten::empty 3.16% 18.290us 3.16% 18.290us 6.097us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.77% 201.335us 34.77% 201.335us 67.112us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.81% 4.680us 0.81% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 440.583us +Self CPU time total: 578.973us Self CUDA time total: 6.624us @@ -4135,17 +4135,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.184us 732.88% 69.184us 69.184us 1 - hf_kernels_swiglu 4.54% 90.562us 99.76% 1.988ms 1.988ms 0.000us 0.00% 12.608us 12.608us 1 - _activation_23bf3fb::silu_and_mul 1.02% 20.260us 94.19% 1.877ms 625.705us 9.440us 100.00% 12.608us 4.203us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3 - Activity Buffer Request 85.41% 1.702ms 85.41% 1.702ms 1.702ms 3.168us 33.56% 3.168us 3.168us 1 - aten::empty 1.03% 20.450us 1.03% 20.450us 6.817us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 7.76% 154.666us 7.76% 154.666us 51.555us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.24% 4.870us 0.24% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.480us 685.45% 64.480us 64.480us 1 + hf_kernels_swiglu 4.47% 90.662us 99.76% 2.023ms 2.023ms 0.000us 0.00% 12.543us 12.543us 1 + _activation_23bf3fb::silu_and_mul 0.98% 19.960us 94.38% 1.913ms 637.817us 9.407us 100.00% 12.543us 4.181us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.407us 100.00% 9.407us 3.136us 3 + Activity Buffer Request 83.63% 1.695ms 83.63% 1.695ms 1.695ms 3.136us 33.34% 3.136us 3.136us 1 + aten::empty 0.91% 18.421us 0.91% 18.421us 6.140us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.77% 198.004us 9.77% 198.004us 66.001us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.24% 4.950us 0.24% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.993ms -Self CUDA time total: 9.440us +Self CPU time total: 2.027ms +Self CUDA time total: 9.407us @@ -4155,17 +4155,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.376us 499.51% 65.376us 65.376us 1 - hf_kernels_swiglu 19.52% 83.334us 98.75% 421.512us 421.512us 0.000us 0.00% 17.472us 17.472us 1 - _activation_23bf3fb::silu_and_mul 4.53% 19.340us 74.78% 319.198us 106.399us 13.088us 100.00% 17.472us 5.824us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.088us 100.00% 13.088us 4.363us 3 - Activity Buffer Request 34.31% 146.444us 34.31% 146.444us 146.444us 4.384us 33.50% 4.384us 4.384us 1 - aten::empty 4.45% 18.980us 4.45% 18.980us 6.327us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 35.94% 153.414us 35.94% 153.414us 51.138us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.25% 5.350us 1.25% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.576us 465.11% 60.576us 60.576us 1 + hf_kernels_swiglu 15.18% 83.082us 99.12% 542.352us 542.352us 0.000us 0.00% 17.408us 17.408us 1 + _activation_23bf3fb::silu_and_mul 3.66% 20.041us 80.66% 441.340us 147.113us 13.024us 100.00% 17.408us 5.803us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.024us 100.00% 13.024us 4.341us 3 + Activity Buffer Request 41.24% 225.625us 41.24% 225.625us 225.625us 4.384us 33.66% 4.384us 4.384us 1 + aten::empty 3.28% 17.930us 3.28% 17.930us 5.977us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 35.76% 195.674us 35.76% 195.674us 65.225us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.88% 4.811us 0.88% 4.811us 4.811us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 426.862us -Self CUDA time total: 13.088us +Self CPU time total: 547.163us +Self CUDA time total: 13.024us impl wl p50(ms) ok @@ -4182,13 +4182,14 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True-▶ UV Install LogsFetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] -Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:01, 5.80it/s] -Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 13.68it/s] -Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 17.69it/s]+Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. + +Fetching 7 files: 29%|██▊ | 2/7 [00:00<00:00, 17.51it/s] +Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.39it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 20.57it/s]Artifacts:
activation.jsonl diff --git a/activation/impls/index.html b/activation/impls/index.html index 02d457f5814d7ec7515a6c7ef12f11b92d7783cf..4bb1f39a085a80b189a23e35553192c7e762dcee 100644 --- a/activation/impls/index.html +++ b/activation/impls/index.html @@ -82,8 +82,7 @@Index of /activation/impls