diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl index 01ad9024b2315e31ab7b238c36c56d81058547ae..5005b51b61d0820b9357a26624cb12042367144c 100644 --- a/activation/impls/artifacts/benchmark/activation.jsonl +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -1,9 +1,9 @@ -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04233100003148138, "p50": 0.043751000021075015, "p90": 0.044161999994685175, "mean": 0.04361539999990782, "iqr": 0.001740000016070553, "raw_times": [0.044161999994685175, 0.04541099997368292, 0.04242199997861462, 0.043751000021075015, 0.04233100003148138], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05063100002189458, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054010999974707374, "p50": 0.05540200004361395, "p90": 0.05709199990633351, "mean": 0.057631800018498325, "iqr": 0.0019599997358454857, "raw_times": [0.054010999974707374, 0.05513200017048803, 0.06652199999734876, 0.05540200004361395, 0.05709199990633351], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05926099993303069, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05346099987946218, "p50": 0.054341999884854886, "p90": 0.05543199995372561, "mean": 0.054953799917711876, "iqr": 0.001390000079481979, "raw_times": [0.05346099987946218, 0.05543199995372561, 0.05749199999627308, 0.054341999884854886, 0.05404199987424363], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05924099991716503, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053281999953469494, "p50": 0.054581999847869156, "p90": 0.05551200001718826, "mean": 0.054651799973726156, "iqr": 0.0014510001165035646, "raw_times": [0.05406099990068469, 0.05582200014941918, 0.05551200001718826, 0.054581999847869156, 0.053281999953469494], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05814099995404831, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05309099992700794, "p50": 0.05449100012810959, "p90": 0.05478200000652578, "mean": 0.05435540001599293, "iqr": 0.0010310000106983352, "raw_times": [0.05449100012810959, 0.055662000022493885, 0.05375099999582744, 0.05309099992700794, 0.05478200000652578], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057451999964541756, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051550999842220335, "p50": 0.052460999995673774, "p90": 0.05307099991114228, "mean": 0.05247719996077649, "iqr": 0.000889999910214101, "raw_times": [0.051550999842220335, 0.05307099991114228, 0.05218100000092818, 0.052460999995673774, 0.05312200005391787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07207299995570793, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052890999995725, "p50": 0.05325200004335784, "p90": 0.054772000112279784, "mean": 0.053839400061406195, "iqr": 0.001821000068957801, "raw_times": [0.05295100004332198, 0.054772000112279784, 0.05325200004335784, 0.052890999995725, 0.055331000112346373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05688200008080457, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05233100000623381, "p50": 0.054522000027645845, "p90": 0.05475100010698952, "mean": 0.05385140002545086, "iqr": 0.0020300001324358163, "raw_times": [0.052720999974553706, 0.05475100010698952, 0.05233100000623381, 0.054932000011831406, 0.054522000027645845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056971000049088616, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052550999953382416, "p50": 0.05365099991649913, "p90": 0.053941000032864395, "mean": 0.053534999960902496, "iqr": 0.0006200000370881753, "raw_times": [0.05421099990599032, 0.05365099991649913, 0.052550999953382416, 0.053941000032864395, 0.05332099999577622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058042000091518275, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.13450000000148066, "p50": 0.1411669999811238, "p90": 0.1532919999931437, "mean": 0.1477000000022599, "iqr": 0.017083999978240172, "raw_times": [0.13620800001490352, 0.1733330000206479, 0.1532919999931437, 0.1411669999811238, 0.13450000000148066], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.1447500000040236, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008915023063309491, "mse": 4.496400833886582e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.1742909999506992, "p50": 0.17550000001165245, "p90": 0.17633400000249821, "mean": 0.17563320000135718, "iqr": 0.001000999986899842, "raw_times": [0.1742909999506992, 0.17633400000249821, 0.17533300001559837, 0.17670800002633769, 0.17550000001165245], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.731916999995974, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008884685230441391, "mse": 4.475335117604118e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.35966699999789853, "p50": 0.3839590000325188, "p90": 0.4197920000024169, "mean": 0.3930668000066362, "iqr": 0.05745900000420079, "raw_times": [0.35966699999789853, 0.3623329999982161, 0.4395830000021306, 0.4197920000024169, 0.3839590000325188], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.37070900003755014, "peak_bytes": null, "ok": false, "absmax": 0.07091712951660156, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.07091712951660156, "mae": 0.0008893357589840889, "mse": 4.469751274882583e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.27337500000612636, "p50": 0.325791999955527, "p90": 0.3564579999988382, "mean": 0.5360415999916768, "iqr": 0.03887500002974775, "raw_times": [1.4070000000288019, 0.3564579999988382, 0.325791999955527, 0.27337500000612636, 0.31758299996909045], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.2649170000145205, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008873133920133114, "mse": 4.3958548303635325e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.3514999999651991, "p50": 0.39737500003411697, "p90": 0.42058299999325754, "mean": 0.44304979999196803, "iqr": 0.05525000000261571, "raw_times": [0.42058299999325754, 0.39737500003411697, 0.6804579999766247, 0.3514999999651991, 0.36533299999064184], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.976333000015984, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008889895398169756, "mse": 4.431089109857567e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.9706249999794636, "p50": 0.9802499999977954, "p90": 3.842000000020107, "mean": 2.413258199999291, "iqr": 2.863209000054212, "raw_times": [3.842000000020107, 5.294625000033193, 0.9802499999977954, 0.978790999965895, 0.9706249999794636], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.3860840000224925, "peak_bytes": null, "ok": false, "absmax": 0.08395957946777344, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.08395957946777344, "mae": 0.0008889408782124519, "mse": 4.476671620068373e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.6639999999720203, "p50": 0.8687079999845082, "p90": 1.1298749999468782, "mean": 0.9603583999933107, "iqr": 0.2749159999098083, "raw_times": [0.8549590000370699, 1.284250000026077, 1.1298749999468782, 0.6639999999720203, 0.8687079999845082], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.7134589999964192, "peak_bytes": null, "ok": false, "absmax": 0.05687236785888672, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.05687236785888672, "mae": 0.0008884922135621309, "mse": 4.399109002406476e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 1.141958000005161, "p50": 1.6311670000277445, "p90": 1.6544580000186215, "mean": 1.7749248000086482, "iqr": 0.366167000038331, "raw_times": [1.6544580000186215, 1.2882909999802905, 3.1587500000114233, 1.6311670000277445, 1.141958000005161], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.0730410000169286, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008890957687981427, "mse": 4.448749677976593e-06, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-12-19T22:43:49Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 2.664708999986942, "p50": 3.365374999987125, "p90": 3.6645420000240847, "mean": 3.5541085999966526, "iqr": 0.8831670000404301, "raw_times": [2.664708999986942, 3.6645420000240847, 3.365374999987125, 5.2945420000014565, 2.7813749999836546], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 17.938291999996636, "peak_bytes": null, "ok": false, "absmax": 0.09098148345947266, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.09098148345947266, "mae": 0.0008892239420674741, "mse": 4.500504473980982e-06, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py index 711af9e01652ef5081b507affd0f7df9ac99e644..fd785e205d9fbcbf9a01065929fe8402a83fcf03 100644 --- a/activation/impls/cells/benchmark.py +++ b/activation/impls/cells/benchmark.py @@ -22,7 +22,7 @@ def swiglu_eager(x): run_benchmark( kernel_type=KernelTypeEnum.ACTIVATION, - impl_name="torch_eager", - impl_tags={"family":"hf-kernels", "backend":"eager"}, + impl_name="torch_eager_darwin", + impl_tags={"family":"pytorch", "backend":"eager", "platform": "darwin"}, impl_func=swiglu_eager, ) \ No newline at end of file diff --git a/activation/impls/cells/sysinfo.py b/activation/impls/cells/sysinfo.py new file mode 100644 index 0000000000000000000000000000000000000000..b2c8722bdf08e8bd2c4f4a673833bd0e5db39fc0 --- /dev/null +++ b/activation/impls/cells/sysinfo.py @@ -0,0 +1,14 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "torch==2.8.0", +# ] +# /// +import platform +import subprocess +print(f"Platform: {platform.system()} {platform.machine()}") +print(f"Python: {platform.python_version()}") +# Check for MPS availability +import torch +print(f"PyTorch: {torch.__version__}") +print(f"MPS available: {torch.backends.mps.is_available()}") \ No newline at end of file diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html index f8d816a75a59b5a7e634d3e627b3d6b3842f78fa..27593f80515fb851d5c18e04f950f201441aca78 100644 --- a/activation/impls/hf_kernels_swiglu.html +++ b/activation/impls/hf_kernels_swiglu.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.29s +Cell: nv | 0.25s | Raw @@ -3905,7 +3905,7 @@ Cell: nv | 0.29s
-
Fri Dec 19 19:54:13 2025       
+
Fri Dec 19 23:01:11 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.29s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   35C    P0            120W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   39C    P0             82W /  350W |       0MiB /  46068MiB |     10%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.29s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 8.35s
+Cell: benchmark | 8.49s
  | 
 
 Raw
@@ -3995,16 +3995,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      85.600us      2073.64%      85.600us      85.600us             1  
-                                      hf_kernels_swiglu         8.76%     183.666us        99.29%       2.081ms       2.081ms       0.000us         0.00%       5.568us       5.568us             1  
-                      _activation_23bf3fb::silu_and_mul         0.98%      20.570us        88.50%       1.855ms     618.341us       4.128us       100.00%       5.568us       1.856us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      76.129us      1844.21%      76.129us      76.129us             1  
+                                      hf_kernels_swiglu         8.60%     174.603us        99.27%       2.015ms       2.015ms       0.000us         0.00%       5.568us       5.568us             1  
+                      _activation_23bf3fb::silu_and_mul         0.97%      19.670us        88.54%       1.797ms     599.020us       4.128us       100.00%       5.568us       1.856us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.128us       100.00%       4.128us       1.376us             3  
-                                Activity Buffer Request        85.39%       1.790ms        85.39%       1.790ms       1.790ms       1.440us        34.88%       1.440us       1.440us             1  
-                                            aten::empty         2.03%      42.471us         2.03%      42.471us      14.157us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.13%      44.611us         2.13%      44.611us      14.870us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.71%      14.820us         0.71%      14.820us      14.820us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        85.37%       1.733ms        85.37%       1.733ms       1.733ms       1.440us        34.88%       1.440us       1.440us             1  
+                                            aten::empty         2.13%      43.191us         2.13%      43.191us      14.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.20%      44.752us         2.20%      44.752us      14.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.73%      14.741us         0.73%      14.741us      14.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.096ms
+Self CPU time total: 2.030ms
 Self CUDA time total: 4.128us
 
 
@@ -4015,17 +4015,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.111us      1666.52%      66.111us      66.111us             1  
-                                      hf_kernels_swiglu         4.94%      94.004us        99.69%       1.897ms       1.897ms       0.000us         0.00%       5.311us       5.311us             1  
-                      _activation_23bf3fb::silu_and_mul         0.99%      18.841us        93.73%       1.783ms     594.417us       3.967us       100.00%       5.311us       1.770us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.967us       100.00%       3.967us       1.322us             3  
-                                Activity Buffer Request        91.36%       1.738ms        91.36%       1.738ms       1.738ms       1.344us        33.88%       1.344us       1.344us             1  
-                                            aten::empty         1.01%      19.260us         1.01%      19.260us       6.420us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.38%      26.230us         1.38%      26.230us       8.743us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.950us         0.31%       5.950us       5.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.783us      1582.23%      62.783us      62.783us             1  
+                                      hf_kernels_swiglu         4.95%      92.601us        99.70%       1.863ms       1.863ms       0.000us         0.00%       5.312us       5.312us             1  
+                      _activation_23bf3fb::silu_and_mul         1.25%      23.392us        93.77%       1.753ms     584.220us       3.968us       100.00%       5.312us       1.771us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
+                                Activity Buffer Request        91.17%       1.704ms        91.17%       1.704ms       1.704ms       1.344us        33.87%       1.344us       1.344us             1  
+                                            aten::empty         0.97%      18.160us         0.97%      18.160us       6.053us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.35%      25.221us         1.35%      25.221us       8.407us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.620us         0.30%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.902ms
-Self CUDA time total: 3.967us
+Self CPU time total: 1.869ms
+Self CUDA time total: 3.968us
 
 
 
@@ -4035,17 +4035,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.479us      1380.35%      68.479us      68.479us             1  
-                                      hf_kernels_swiglu         4.69%      88.684us        99.71%       1.886ms       1.886ms       0.000us         0.00%       6.625us       6.625us             1  
-                      _activation_23bf3fb::silu_and_mul         0.99%      18.661us        94.04%       1.778ms     592.827us       4.961us       100.00%       6.625us       2.208us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.961us       100.00%       4.961us       1.654us             3  
-                                Activity Buffer Request        91.53%       1.731ms        91.53%       1.731ms       1.731ms       1.664us        33.54%       1.664us       1.664us             1  
-                                            aten::empty         0.98%      18.610us         0.98%      18.610us       6.203us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.52%      28.800us         1.52%      28.800us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.500us         0.29%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      61.887us      1264.03%      61.887us      61.887us             1  
+                                      hf_kernels_swiglu         4.90%      91.392us        99.70%       1.861ms       1.861ms       0.000us         0.00%       6.528us       6.528us             1  
+                      _activation_23bf3fb::silu_and_mul         1.06%      19.772us        93.81%       1.751ms     583.690us       4.896us       100.00%       6.528us       2.176us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.896us       100.00%       4.896us       1.632us             3  
+                                Activity Buffer Request        91.42%       1.706ms        91.42%       1.706ms       1.706ms       1.632us        33.33%       1.632us       1.632us             1  
+                                            aten::empty         1.00%      18.580us         1.00%      18.580us       6.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.33%      24.870us         1.33%      24.870us       8.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.640us         0.30%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.891ms
-Self CUDA time total: 4.961us
+Self CPU time total: 1.867ms
+Self CUDA time total: 4.896us
 
 
 
@@ -4055,17 +4055,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.368us      1547.76%      66.368us      66.368us             1  
-                                      hf_kernels_swiglu         4.25%      87.402us        99.76%       2.051ms       2.051ms       0.000us         0.00%       5.760us       5.760us             1  
-                      _activation_23bf3fb::silu_and_mul         0.97%      19.981us        94.58%       1.945ms     648.228us       4.288us       100.00%       5.760us       1.920us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.288us       100.00%       4.288us       1.429us             3  
-                                Activity Buffer Request        83.83%       1.724ms        83.83%       1.724ms       1.724ms       1.472us        34.33%       1.472us       1.472us             1  
-                                            aten::empty         0.93%      19.111us         0.93%      19.111us       6.370us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.77%     200.885us         9.77%     200.885us      66.962us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.020us         0.24%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.431us      1560.88%      66.431us      66.431us             1  
+                                      hf_kernels_swiglu         4.62%      96.552us        99.72%       2.084ms       2.084ms       0.000us         0.00%       5.696us       5.696us             1  
+                      _activation_23bf3fb::silu_and_mul         0.92%      19.230us        94.20%       1.969ms     656.267us       4.256us       100.00%       5.696us       1.899us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.256us       100.00%       4.256us       1.419us             3  
+                                Activity Buffer Request        82.63%       1.727ms        82.63%       1.727ms       1.727ms       1.440us        33.83%       1.440us       1.440us             1  
+                                            aten::empty         0.91%      18.961us         0.91%      18.961us       6.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.64%     222.454us        10.64%     222.454us      74.151us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.800us         0.28%       5.800us       5.800us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.056ms
-Self CUDA time total: 4.288us
+Self CPU time total: 2.090ms
+Self CUDA time total: 4.256us
 
 
 
@@ -4075,17 +4075,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.360us      1131.72%      67.360us      67.360us             1  
-                                      hf_kernels_swiglu         4.31%      89.293us        99.77%       2.067ms       2.067ms       0.000us         0.00%       7.968us       7.968us             1  
-                      _activation_23bf3fb::silu_and_mul         0.98%      20.220us        94.55%       1.959ms     652.859us       5.952us       100.00%       7.968us       2.656us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us       100.00%       5.952us       1.984us             3  
-                                Activity Buffer Request        85.78%       1.777ms        85.78%       1.777ms       1.777ms       2.016us        33.87%       2.016us       2.016us             1  
-                                            aten::empty         0.91%      18.861us         0.91%      18.861us       6.287us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.79%     161.464us         7.79%     161.464us      53.821us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.820us         0.23%       4.820us       4.820us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.753us      1065.60%      62.753us      62.753us             1  
+                                      hf_kernels_swiglu         4.32%      90.233us        99.73%       2.084ms       2.084ms       0.000us         0.00%       7.842us       7.842us             1  
+                      _activation_23bf3fb::silu_and_mul         0.98%      20.530us        94.51%       1.975ms     658.421us       5.889us       100.00%       7.842us       2.614us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.889us       100.00%       5.889us       1.963us             3  
+                                Activity Buffer Request        83.43%       1.744ms        83.43%       1.744ms       1.744ms       1.953us        33.16%       1.953us       1.953us             1  
+                                            aten::empty         0.90%      18.820us         0.90%      18.820us       6.273us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.09%     210.974us        10.09%     210.974us      70.325us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.680us         0.27%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.072ms
-Self CUDA time total: 5.952us
+Self CPU time total: 2.090ms
+Self CUDA time total: 5.889us
 
 
 
@@ -4095,17 +4095,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      64.574us       830.43%      64.574us      64.574us             1  
-                                      hf_kernels_swiglu        18.42%      86.111us        98.86%     462.073us     462.073us       0.000us         0.00%      10.367us      10.367us             1  
-                      _activation_23bf3fb::silu_and_mul         4.27%      19.980us        76.48%     357.451us     119.150us       7.776us       100.00%      10.367us       3.456us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us       100.00%       7.776us       2.592us             3  
-                                Activity Buffer Request        38.90%     181.805us        38.90%     181.805us     181.805us       2.591us        33.32%       2.591us       2.591us             1  
-                                            aten::empty         3.96%      18.511us         3.96%      18.511us       6.170us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.30%     155.666us        33.30%     155.666us      51.889us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.14%       5.330us         1.14%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      58.974us       761.74%      58.974us      58.974us             1  
+                                      hf_kernels_swiglu        14.39%      83.563us        99.11%     575.543us     575.543us       0.000us         0.00%      10.333us      10.333us             1  
+                      _activation_23bf3fb::silu_and_mul         3.37%      19.590us        81.67%     474.270us     158.090us       7.742us       100.00%      10.333us       3.444us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.742us       100.00%       7.742us       2.581us             3  
+                                Activity Buffer Request        43.30%     251.476us        43.30%     251.476us     251.476us       2.591us        33.47%       2.591us       2.591us             1  
+                                            aten::empty         3.05%      17.710us         3.05%      17.710us       5.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.99%     203.204us        34.99%     203.204us      67.735us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.89%       5.190us         0.89%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 467.403us
-Self CUDA time total: 7.776us
+Self CPU time total: 580.733us
+Self CUDA time total: 7.742us
 
 
 
@@ -4115,16 +4115,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.527us       943.95%      62.527us      62.527us             1  
-                                      hf_kernels_swiglu        18.86%      83.092us        98.85%     435.523us     435.523us       0.000us         0.00%       8.832us       8.832us             1  
-                      _activation_23bf3fb::silu_and_mul         4.63%      20.380us        75.83%     334.080us     111.360us       6.624us       100.00%       8.832us       2.944us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      60.191us       908.68%      60.191us      60.191us             1  
+                                      hf_kernels_swiglu        14.49%      83.902us        99.19%     574.293us     574.293us       0.000us         0.00%       8.832us       8.832us             1  
+                      _activation_23bf3fb::silu_and_mul         3.38%      19.561us        81.54%     472.101us     157.367us       6.624us       100.00%       8.832us       2.944us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us       100.00%       6.624us       2.208us             3  
-                                Activity Buffer Request        36.44%     160.555us        36.44%     160.555us     160.555us       2.208us        33.33%       2.208us       2.208us             1  
-                                            aten::empty         4.17%      18.351us         4.17%      18.351us       6.117us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.76%     153.145us        34.76%     153.145us      51.048us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.15%       5.060us         1.15%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        43.39%     251.205us        43.39%     251.205us     251.205us       2.208us        33.33%       2.208us       2.208us             1  
+                                            aten::empty         3.16%      18.290us         3.16%      18.290us       6.097us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.77%     201.335us        34.77%     201.335us      67.112us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.81%       4.680us         0.81%       4.680us       4.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 440.583us
+Self CPU time total: 578.973us
 Self CUDA time total: 6.624us
 
 
@@ -4135,17 +4135,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.184us       732.88%      69.184us      69.184us             1  
-                                      hf_kernels_swiglu         4.54%      90.562us        99.76%       1.988ms       1.988ms       0.000us         0.00%      12.608us      12.608us             1  
-                      _activation_23bf3fb::silu_and_mul         1.02%      20.260us        94.19%       1.877ms     625.705us       9.440us       100.00%      12.608us       4.203us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.440us       100.00%       9.440us       3.147us             3  
-                                Activity Buffer Request        85.41%       1.702ms        85.41%       1.702ms       1.702ms       3.168us        33.56%       3.168us       3.168us             1  
-                                            aten::empty         1.03%      20.450us         1.03%      20.450us       6.817us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.76%     154.666us         7.76%     154.666us      51.555us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       4.870us         0.24%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      64.480us       685.45%      64.480us      64.480us             1  
+                                      hf_kernels_swiglu         4.47%      90.662us        99.76%       2.023ms       2.023ms       0.000us         0.00%      12.543us      12.543us             1  
+                      _activation_23bf3fb::silu_and_mul         0.98%      19.960us        94.38%       1.913ms     637.817us       9.407us       100.00%      12.543us       4.181us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.407us       100.00%       9.407us       3.136us             3  
+                                Activity Buffer Request        83.63%       1.695ms        83.63%       1.695ms       1.695ms       3.136us        33.34%       3.136us       3.136us             1  
+                                            aten::empty         0.91%      18.421us         0.91%      18.421us       6.140us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.77%     198.004us         9.77%     198.004us      66.001us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       4.950us         0.24%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.993ms
-Self CUDA time total: 9.440us
+Self CPU time total: 2.027ms
+Self CUDA time total: 9.407us
 
 
 
@@ -4155,17 +4155,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.376us       499.51%      65.376us      65.376us             1  
-                                      hf_kernels_swiglu        19.52%      83.334us        98.75%     421.512us     421.512us       0.000us         0.00%      17.472us      17.472us             1  
-                      _activation_23bf3fb::silu_and_mul         4.53%      19.340us        74.78%     319.198us     106.399us      13.088us       100.00%      17.472us       5.824us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.088us       100.00%      13.088us       4.363us             3  
-                                Activity Buffer Request        34.31%     146.444us        34.31%     146.444us     146.444us       4.384us        33.50%       4.384us       4.384us             1  
-                                            aten::empty         4.45%      18.980us         4.45%      18.980us       6.327us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.94%     153.414us        35.94%     153.414us      51.138us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.25%       5.350us         1.25%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      60.576us       465.11%      60.576us      60.576us             1  
+                                      hf_kernels_swiglu        15.18%      83.082us        99.12%     542.352us     542.352us       0.000us         0.00%      17.408us      17.408us             1  
+                      _activation_23bf3fb::silu_and_mul         3.66%      20.041us        80.66%     441.340us     147.113us      13.024us       100.00%      17.408us       5.803us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.024us       100.00%      13.024us       4.341us             3  
+                                Activity Buffer Request        41.24%     225.625us        41.24%     225.625us     225.625us       4.384us        33.66%       4.384us       4.384us             1  
+                                            aten::empty         3.28%      17.930us         3.28%      17.930us       5.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.76%     195.674us        35.76%     195.674us      65.225us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.88%       4.811us         0.88%       4.811us       4.811us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 426.862us
-Self CUDA time total: 13.088us
+Self CPU time total: 547.163us
+Self CUDA time total: 13.024us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4182,13 +4182,14 @@ hf_kernels_swiglu        cuda_T512_D768         0.03  True
 
▶ UV Install Logs
-
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] -Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:01, 5.80it/s] -Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 13.68it/s] -Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 17.69it/s]
+
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. + +Fetching 7 files: 29%|██▊ | 2/7 [00:00<00:00, 17.51it/s] +Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.39it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 20.57it/s]

Artifacts:

activation.jsonl diff --git a/activation/impls/index.html b/activation/impls/index.html index 02d457f5814d7ec7515a6c7ef12f11b92d7783cf..4bb1f39a085a80b189a23e35553192c7e762dcee 100644 --- a/activation/impls/index.html +++ b/activation/impls/index.html @@ -82,8 +82,7 @@

Index of /activation/impls

\ No newline at end of file diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html index 0ea5672107f5277e78f049ca5bfa32c986c83270..68021532ea5e390038d81523f729bbee6e7c21ae 100644 --- a/activation/impls/torch_swiglu.html +++ b/activation/impls/torch_swiglu.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.29s +Cell: nv | 0.25s | Raw @@ -3904,7 +3904,7 @@ Cell: nv | 0.29s
-
Fri Dec 19 19:54:13 2025       
+
Fri Dec 19 23:01:11 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.29s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   35C    P0            120W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   39C    P0             82W /  350W |       0MiB /  46068MiB |     10%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.29s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 3.69s
+Cell: benchmark | 3.88s
  | 
 
 Raw
@@ -3987,20 +3987,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     187.393us      1471.25%     187.393us     187.393us             1  
-                                            torch_eager         9.10%     197.603us        99.31%       2.157ms       2.157ms       0.000us         0.00%      15.073us      15.073us             1  
-                                             aten::silu         2.86%      62.203us        85.13%       1.849ms     616.358us       6.561us        51.51%       8.897us       2.966us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.561us        51.51%       6.561us       2.187us             3  
-                                              aten::mul         1.58%      34.212us         2.55%      55.432us      18.477us       6.176us        48.49%       6.176us       2.059us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     176.512us      1385.93%     176.512us     176.512us             1  
+                                            torch_eager         8.54%     185.335us        99.30%       2.155ms       2.155ms       0.000us         0.00%      15.072us      15.072us             1  
+                                             aten::silu         2.61%      56.610us        85.90%       1.864ms     621.400us       6.560us        51.51%       8.896us       2.965us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.51%       6.560us       2.187us             3  
+                                              aten::mul         1.46%      31.580us         2.49%      54.091us      18.030us       6.176us        48.49%       6.176us       2.059us             3  
 void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.49%       6.176us       2.059us             3  
-                                Activity Buffer Request        80.18%       1.741ms        80.18%       1.741ms       1.741ms       2.336us        18.34%       2.336us       2.336us             1  
-                                            aten::slice         2.02%      43.964us         2.53%      55.013us       9.169us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.51%      11.049us         0.51%      11.049us       1.842us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.07%      66.630us         3.07%      66.630us      11.105us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.69%      14.920us         0.69%      14.920us      14.920us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        81.25%       1.763ms        81.25%       1.763ms       1.763ms       2.336us        18.34%       2.336us       2.336us             1  
+                                            aten::slice         1.93%      41.799us         2.37%      51.470us       8.578us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.45%       9.671us         0.45%       9.671us       1.612us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.08%      66.742us         3.08%      66.742us      11.124us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.70%      15.141us         0.70%      15.141us      15.141us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.172ms
-Self CUDA time total: 12.737us
+Self CPU time total: 2.170ms
+Self CUDA time total: 12.736us
 
 
 
@@ -4010,20 +4010,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     159.648us      1289.36%     159.648us     159.648us             1  
-                                            torch_eager         6.57%     137.523us        99.70%       2.087ms       2.087ms       0.000us         0.00%      14.526us      14.526us             1  
-                                             aten::silu         2.02%      42.391us        89.22%       1.868ms     622.711us       6.399us        51.68%       8.543us       2.848us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.399us        51.68%       6.399us       2.133us             3  
-                                              aten::mul         1.43%      29.882us         2.35%      49.282us      16.427us       5.983us        48.32%       5.983us       1.994us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        48.32%       5.983us       1.994us             3  
-                                Activity Buffer Request        85.88%       1.798ms        85.88%       1.798ms       1.798ms       2.144us        17.32%       2.144us       2.144us             1  
-                                            aten::slice         1.30%      27.292us         1.55%      32.512us       5.419us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.25%       5.220us         0.25%       5.220us       0.870us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.25%      47.061us         2.25%      47.061us       7.843us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.30%       6.330us         0.30%       6.330us       6.330us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     147.103us      1187.66%     147.103us     147.103us             1  
+                                            torch_eager         6.42%     134.022us        99.73%       2.081ms       2.081ms       0.000us         0.00%      14.563us      14.563us             1  
+                                             aten::silu         1.84%      38.392us        89.66%       1.871ms     623.681us       6.401us        51.68%       8.578us       2.859us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.401us        51.68%       6.401us       2.134us             3  
+                                              aten::mul         1.30%      27.120us         2.25%      46.940us      15.647us       5.985us        48.32%       5.985us       1.995us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.985us        48.32%       5.985us       1.995us             3  
+                                Activity Buffer Request        86.49%       1.805ms        86.49%       1.805ms       1.805ms       2.177us        17.58%       2.177us       2.177us             1  
+                                            aten::slice         1.12%      23.282us         1.39%      29.102us       4.850us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.28%       5.820us         0.28%       5.820us       0.970us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.28%      47.661us         2.28%      47.661us       7.944us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.671us         0.27%       5.671us       5.671us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.094ms
-Self CUDA time total: 12.382us
+Self CPU time total: 2.087ms
+Self CUDA time total: 12.386us
 
 
 
@@ -4033,20 +4033,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.680us      1172.29%     155.680us     155.680us             1  
-                                            torch_eager         6.64%     129.562us        99.68%       1.946ms       1.946ms       0.000us         0.00%      15.584us      15.584us             1  
-                                             aten::silu         2.16%      42.182us        89.10%       1.739ms     579.704us       6.848us        51.57%       9.152us       3.051us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.848us        51.57%       6.848us       2.283us             3  
-                                              aten::mul         1.46%      28.592us         2.38%      46.553us      15.518us       6.432us        48.43%       6.432us       2.144us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.43%       6.432us       2.144us             3  
-                                Activity Buffer Request        85.60%       1.671ms        85.60%       1.671ms       1.671ms       2.304us        17.35%       2.304us       2.304us             1  
-                                            aten::slice         1.31%      25.640us         1.56%      30.540us       5.090us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.25%       4.900us         0.25%       4.900us       0.817us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.26%      44.052us         2.26%      44.052us       7.342us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.32%       6.150us         0.32%       6.150us       6.150us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     145.888us      1106.55%     145.888us     145.888us             1  
+                                            torch_eager         6.31%     124.322us        99.70%       1.963ms       1.963ms       0.000us         0.00%      15.456us      15.456us             1  
+                                             aten::silu         2.05%      40.451us        89.58%       1.764ms     587.980us       6.784us        51.46%       9.056us       3.019us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.46%       6.784us       2.261us             3  
+                                              aten::mul         1.27%      25.091us         2.33%      45.941us      15.314us       6.400us        48.54%       6.400us       2.133us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.400us        48.54%       6.400us       2.133us             3  
+                                Activity Buffer Request        86.22%       1.698ms        86.22%       1.698ms       1.698ms       2.272us        17.23%       2.272us       2.272us             1  
+                                            aten::slice         1.19%      23.361us         1.47%      29.031us       4.839us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.29%       5.670us         0.29%       5.670us       0.945us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.36%      46.481us         2.36%      46.481us       7.747us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.30%       5.880us         0.30%       5.880us       5.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.952ms
-Self CUDA time total: 13.280us
+Self CPU time total: 1.969ms
+Self CUDA time total: 13.184us
 
 
 
@@ -4056,20 +4056,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.289us      1264.91%     160.289us     160.289us             1  
-                                            torch_eager         6.06%     136.754us        99.75%       2.252ms       2.252ms       0.000us         0.00%      14.880us      14.880us             1  
-                                             aten::silu         1.87%      42.159us        90.17%       2.036ms     678.503us       6.560us        51.77%       8.768us       2.923us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.77%       6.560us       2.187us             3  
-                                              aten::mul         1.25%      28.231us         2.20%      49.632us      16.544us       6.112us        48.23%       6.112us       2.037us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.112us        48.23%       6.112us       2.037us             3  
-                                Activity Buffer Request        79.28%       1.790ms        79.28%       1.790ms       1.790ms       2.208us        17.42%       2.208us       2.208us             1  
-                                            aten::slice         1.09%      24.671us         1.32%      29.801us       4.967us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.23%       5.130us         0.23%       5.130us       0.855us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.98%     225.208us         9.98%     225.208us      37.535us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       5.621us         0.25%       5.621us       5.621us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     156.959us      1244.82%     156.959us     156.959us             1  
+                                            torch_eager         5.42%     122.252us        99.77%       2.252ms       2.252ms       0.000us         0.00%      14.785us      14.785us             1  
+                                             aten::silu         1.78%      40.211us        90.81%       2.050ms     683.202us       6.497us        51.53%       8.673us       2.891us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.497us        51.53%       6.497us       2.166us             3  
+                                              aten::mul         1.27%      28.640us         2.19%      49.471us      16.490us       6.112us        48.47%       6.112us       2.037us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.112us        48.47%       6.112us       2.037us             3  
+                                Activity Buffer Request        80.04%       1.807ms        80.04%       1.807ms       1.807ms       2.176us        17.26%       2.176us       2.176us             1  
+                                            aten::slice         1.10%      24.730us         1.36%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.26%       5.930us         0.26%       5.930us       0.988us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.91%     223.637us         9.91%     223.637us      37.273us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       5.130us         0.23%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
 Self CPU time total: 2.257ms
-Self CUDA time total: 12.672us
+Self CUDA time total: 12.609us
 
 
 
@@ -4079,20 +4079,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     159.295us      1196.72%     159.295us     159.295us             1  
-                                            torch_eager         6.43%     135.135us        99.75%       2.096ms       2.096ms       0.000us         0.00%      15.615us      15.615us             1  
-                                             aten::silu         2.00%      41.931us        89.60%       1.883ms     627.518us       6.815us        51.20%       9.119us       3.040us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.815us        51.20%       6.815us       2.272us             3  
-                                              aten::mul         1.42%      29.749us         2.27%      47.691us      15.897us       6.496us        48.80%       6.496us       2.165us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.496us        48.80%       6.496us       2.165us             3  
-                                Activity Buffer Request        79.61%       1.673ms        79.61%       1.673ms       1.673ms       2.304us        17.31%       2.304us       2.304us             1  
-                                            aten::slice         1.22%      25.650us         1.46%      30.630us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.24%       4.980us         0.24%       4.980us       0.830us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.84%     185.847us         8.84%     185.847us      30.974us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       5.161us         0.25%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.703us      1215.97%     160.703us     160.703us             1  
+                                            torch_eager         6.46%     135.762us        99.74%       2.098ms       2.098ms       0.000us         0.00%      15.488us      15.488us             1  
+                                             aten::silu         1.92%      40.421us        89.37%       1.880ms     626.541us       6.816us        51.57%       9.088us       3.029us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        51.57%       6.816us       2.272us             3  
+                                              aten::mul         1.37%      28.851us         2.33%      49.101us      16.367us       6.400us        48.43%       6.400us       2.133us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.400us        48.43%       6.400us       2.133us             3  
+                                Activity Buffer Request        79.67%       1.676ms        79.67%       1.676ms       1.676ms       2.272us        17.19%       2.272us       2.272us             1  
+                                            aten::slice         1.24%      26.071us         1.57%      33.081us       5.513us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.33%       7.010us         0.33%       7.010us       1.168us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.75%     183.945us         8.75%     183.945us      30.657us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.530us         0.26%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.101ms
-Self CUDA time total: 13.311us
+Self CPU time total: 2.103ms
+Self CUDA time total: 13.216us
 
 
 
@@ -4102,20 +4102,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     158.432us      1014.55%     158.432us     158.432us             1  
-                                            torch_eager         6.38%     140.261us        99.75%       2.192ms       2.192ms       0.000us         0.00%      18.304us      18.304us             1  
-                                             aten::silu         1.93%      42.492us        89.81%       1.973ms     657.799us       8.000us        51.23%      10.688us       3.563us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       8.000us        51.23%       8.000us       2.667us             3  
-                                              aten::mul         1.27%      27.872us         2.10%      46.122us      15.374us       7.616us        48.77%       7.616us       2.539us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        48.77%       7.616us       2.539us             3  
-                                Activity Buffer Request        80.61%       1.771ms        80.61%       1.771ms       1.771ms       2.688us        17.21%       2.688us       2.688us             1  
-                                            aten::slice         1.22%      26.832us         1.46%      31.992us       5.332us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.23%       5.160us         0.23%       5.160us       0.860us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.09%     177.845us         8.09%     177.845us      29.641us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       5.530us         0.25%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.606us      1034.83%     160.606us     160.606us             1  
+                                            torch_eager         5.99%     133.963us        99.76%       2.233ms       2.233ms       0.000us         0.00%      18.208us      18.208us             1  
+                                             aten::silu         1.79%      40.170us        90.10%       2.017ms     672.181us       7.936us        51.13%      10.624us       3.541us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.13%       7.936us       2.645us             3  
+                                              aten::mul         1.29%      28.971us         2.18%      48.701us      16.234us       7.584us        48.87%       7.584us       2.528us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.584us        48.87%       7.584us       2.528us             3  
+                                Activity Buffer Request        81.23%       1.818ms        81.23%       1.818ms       1.818ms       2.688us        17.32%       2.688us       2.688us             1  
+                                            aten::slice         1.18%      26.440us         1.50%      33.480us       5.580us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       7.040us         0.31%       7.040us       1.173us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         7.96%     178.055us         7.96%     178.055us      29.676us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.430us         0.24%       5.430us       5.430us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.197ms
-Self CUDA time total: 15.616us
+Self CPU time total: 2.238ms
+Self CUDA time total: 15.520us
 
 
 
@@ -4125,20 +4125,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     158.083us      1097.65%     158.083us     158.083us             1  
-                                            torch_eager         6.35%     128.334us        99.75%       2.015ms       2.015ms       0.000us         0.00%      16.898us      16.898us             1  
-                                             aten::silu         2.10%      42.419us        89.46%       1.807ms     602.407us       7.394us        51.34%       9.890us       3.297us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.394us        51.34%       7.394us       2.465us             3  
-                                              aten::mul         1.39%      28.141us         2.40%      48.382us      16.127us       7.008us        48.66%       7.008us       2.336us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.66%       7.008us       2.336us             3  
-                                Activity Buffer Request        79.49%       1.606ms        79.49%       1.606ms       1.606ms       2.496us        17.33%       2.496us       2.496us             1  
-                                            aten::slice         1.27%      25.691us         1.54%      31.081us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.27%       5.390us         0.27%       5.390us       0.898us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.88%     179.306us         8.88%     179.306us      29.884us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       5.100us         0.25%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     161.343us      1127.96%     161.343us     161.343us             1  
+                                            torch_eager         6.15%     126.753us        99.76%       2.055ms       2.055ms       0.000us         0.00%      16.768us      16.768us             1  
+                                             aten::silu         2.04%      42.050us        89.57%       1.845ms     614.923us       7.328us        51.23%       9.792us       3.264us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        51.23%       7.328us       2.443us             3  
+                                              aten::mul         1.44%      29.680us         2.44%      50.310us      16.770us       6.976us        48.77%       6.976us       2.325us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.976us        48.77%       6.976us       2.325us             3  
+                                Activity Buffer Request        78.32%       1.613ms        78.32%       1.613ms       1.613ms       2.464us        17.23%       2.464us       2.464us             1  
+                                            aten::slice         1.25%      25.802us         1.59%      32.722us       5.454us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.34%       6.920us         0.34%       6.920us       1.153us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.21%     210.375us        10.21%     210.375us      35.062us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.981us         0.24%       4.981us       4.981us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.020ms
-Self CUDA time total: 14.402us
+Self CPU time total: 2.060ms
+Self CUDA time total: 14.304us
 
 
 
@@ -4148,20 +4148,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     158.658us      1020.18%     158.658us     158.658us             1  
-                                            torch_eager         5.52%     111.823us        99.73%       2.019ms       2.019ms       0.000us         0.00%      18.240us      18.240us             1  
-                                             aten::silu         2.12%      42.830us        90.25%       1.827ms     609.110us       7.936us        51.03%      10.624us       3.541us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.03%       7.936us       2.645us             3  
-                                              aten::mul         1.37%      27.772us         2.44%      49.332us      16.444us       7.616us        48.97%       7.616us       2.539us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        48.97%       7.616us       2.539us             3  
-                                Activity Buffer Request        80.18%       1.623ms        80.18%       1.623ms       1.623ms       2.688us        17.28%       2.688us       2.688us             1  
-                                            aten::slice         1.25%      25.302us         1.51%      30.641us       5.107us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.26%       5.339us         0.26%       5.339us       0.890us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.02%     182.624us         9.02%     182.624us      30.437us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       5.520us         0.27%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.936us      1000.62%     155.936us     155.936us             1  
+                                            torch_eager         5.31%     107.073us        99.73%       2.011ms       2.011ms       0.000us         0.00%      18.272us      18.272us             1  
+                                             aten::silu         1.95%      39.312us        90.55%       1.825ms     608.464us       7.968us        51.13%      10.656us       3.552us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        51.13%       7.968us       2.656us             3  
+                                              aten::mul         1.40%      28.240us         2.34%      47.090us      15.697us       7.616us        48.87%       7.616us       2.539us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        48.87%       7.616us       2.539us             3  
+                                Activity Buffer Request        80.78%       1.628ms        80.78%       1.628ms       1.628ms       2.688us        17.25%       2.688us       2.688us             1  
+                                            aten::slice         1.22%      24.550us         1.54%      30.960us       5.160us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       6.410us         0.32%       6.410us       1.068us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.75%     176.473us         8.75%     176.473us      29.412us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.381us         0.27%       5.381us       5.381us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.025ms
-Self CUDA time total: 15.552us
+Self CPU time total: 2.016ms
+Self CUDA time total: 15.584us
 
 
 
@@ -4171,24 +4171,24 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     164.002us       726.96%     164.002us     164.002us             1  
-                                            torch_eager         5.39%     111.814us        99.74%       2.071ms       2.071ms       0.000us         0.00%      26.464us      26.464us             1  
-                                             aten::silu         2.07%      43.010us        90.61%       1.881ms     627.114us      11.616us        51.49%      15.520us       5.173us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.616us        51.49%      11.616us       3.872us             3  
-                                              aten::mul         1.37%      28.451us         2.32%      48.232us      16.077us      10.944us        48.51%      10.944us       3.648us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.944us        48.51%      10.944us       3.648us             3  
-                                Activity Buffer Request        80.76%       1.677ms        80.76%       1.677ms       1.677ms       3.904us        17.30%       3.904us       3.904us             1  
-                                            aten::slice         1.14%      23.769us         1.41%      29.310us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.27%       5.541us         0.27%       5.541us       0.923us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.74%     181.415us         8.74%     181.415us      30.236us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       5.500us         0.26%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     156.608us       695.20%     156.608us     156.608us             1  
+                                            torch_eager         4.97%     102.273us        99.73%       2.054ms       2.054ms       0.000us         0.00%      26.431us      26.431us             1  
+                                             aten::silu         1.93%      39.830us        90.91%       1.872ms     624.047us      11.552us        51.28%      15.456us       5.152us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        51.28%      11.552us       3.851us             3  
+                                              aten::mul         1.40%      28.900us         2.35%      48.460us      16.153us      10.975us        48.72%      10.975us       3.658us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.975us        48.72%      10.975us       3.658us             3  
+                                Activity Buffer Request        81.21%       1.672ms        81.21%       1.672ms       1.672ms       3.904us        17.33%       3.904us       3.904us             1  
+                                            aten::slice         1.20%      24.753us         1.50%      30.941us       5.157us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.30%       6.188us         0.30%       6.188us       1.031us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.72%     179.534us         8.72%     179.534us      29.922us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.530us         0.27%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.076ms
-Self CUDA time total: 22.560us
+Self CPU time total: 2.059ms
+Self CUDA time total: 22.527us
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_T128_D1024        0.06  True
+torch_eager              cuda_T128_D1024        0.05  True
 torch_eager              cuda_T128_D2048        0.05  True
 torch_eager              cuda_T128_D768         0.04  True
 torch_eager              cuda_T256_D1024        0.05  True
diff --git a/activation/impls/torch_swiglu_darwin.html b/activation/impls/torch_swiglu_darwin.html
new file mode 100644
index 0000000000000000000000000000000000000000..f62e086d563f04b0eaa65ab20c7a25c5df6957de
--- /dev/null
+++ b/activation/impls/torch_swiglu_darwin.html
@@ -0,0 +1,4346 @@
+
+
+
+    
+    
+    torch_swiglu_darwin
+
+    
+    
+    
+
+    
+    
+    
+
+
+
+
+    
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Darwin arm64 | macOS-15.7.2-arm64-arm-64bit +
+
+ +
+

PyTorch Native - SwiGLU Activation (macOS)

+

System Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: sysinfo | 6.52s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "torch==2.8.0",
+# ]
+# ///
+import platform
+import subprocess
+print(f"Platform: {platform.system()} {platform.machine()}")
+print(f"Python: {platform.python_version()}")
+# Check for MPS availability
+import torch
+print(f"PyTorch: {torch.__version__}")
+print(f"MPS available: {torch.backends.mps.is_available()}")
+
+ +
+
+
+
+
Platform: Darwin arm64
+Python: 3.11.14
+PyTorch: 2.8.0
+MPS available: True
+
+
+
▶ UV Install Logs
+ +
+
/Users/runner/work/_temp/setup-uv-cache/environments-v2/sysinfo-29fbb5e6dd1955a1/lib/python3.11/site-packages/torch/_subclasses/functional_tensor.py:279: UserWarning: Failed to initialize NumPy: No module named 'numpy' (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:81.) + cpu = _conversion_method_template(device=torch.device("cpu"))
+
+
+ +

SwiGLU Benchmark (PyTorch Native - macOS)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 36.20s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+import torch, torch.nn.functional as F
+
+
+def swiglu_eager(x):
+    d = x.shape[-1] // 2
+    return F.silu(x[..., :d]) * x[..., d:]
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ACTIVATION,
+    impl_name="torch_eager_darwin",
+    impl_tags={"family":"pytorch", "backend":"eager", "platform": "darwin"},
+    impl_func=swiglu_eager,
+)
+
+ +
+
+
+
+
Running activation benchmark on cpu with 9 workloads.
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T128_D768
+======================================================================
+tensor([-0.1372, -0.0743, -0.1620, -0.0605,  0.9656])
+tensor([-0.1367, -0.0742, -0.1611, -0.0608,  0.9648])
+torch.Size([128, 768])
+torch.Size([128, 768])
+Diff stats:
+  max: 0.04913330078125
+  mean: 0.0008915023063309491
+  mse: 4.496400833886582e-06
+Top 5 most different elements (index: value):
+  9070: diff=0.04913330078125, out=-7.79913330078125, ref=-7.75
+  24359: diff=0.04716157913208008, out=-7.04658842086792, ref=-7.09375
+  32951: diff=0.04714775085449219, out=4.952852249145508, ref=5.0
+  73101: diff=0.04227447509765625, out=5.261024475097656, ref=5.21875
+  69062: diff=0.040175437927246094, out=-4.553574562072754, ref=-4.59375
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T128_D768
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        44.59%       6.820ms       100.00%      15.295ms      15.295ms             1  
+            aten::silu        28.59%       4.373ms        28.59%       4.373ms       1.458ms             3  
+           aten::slice        22.18%       3.392ms        22.42%       3.429ms     571.467us             6  
+             aten::mul         4.41%     673.951us         4.41%     673.951us     224.650us             3  
+      aten::as_strided         0.24%      36.334us         0.24%      36.334us       6.056us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 15.295ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T128_D1024
+======================================================================
+tensor([0.2989, 0.0356, 0.0747, 0.2235, 0.3924])
+tensor([0.2988, 0.0356, 0.0742, 0.2246, 0.3926])
+torch.Size([128, 1024])
+torch.Size([128, 1024])
+Diff stats:
+  max: 0.06802082061767578
+  mean: 0.0008884685230441391
+  mse: 4.475335117604118e-06
+Top 5 most different elements (index: value):
+  98566: diff=0.06802082061767578, out=9.005520820617676, ref=8.9375
+  105439: diff=0.058165550231933594, out=-7.816834449768066, ref=-7.875
+  101051: diff=0.050549983978271484, out=-4.8005499839782715, ref=-4.75
+  108736: diff=0.04906034469604492, out=4.794689655303955, ref=4.84375
+  113765: diff=0.048272132873535156, out=-5.857977867126465, ref=-5.90625
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T128_D1024
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        14.33%     153.966us       100.00%       1.074ms       1.074ms             1  
+            aten::silu        67.17%     721.584us        67.17%     721.584us     240.528us             3  
+             aten::mul        15.74%     169.043us        15.74%     169.043us      56.348us             3  
+           aten::slice         2.20%      23.619us         2.76%      29.702us       4.950us             6  
+      aten::as_strided         0.57%       6.083us         0.57%       6.083us       1.014us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.074ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T128_D2048
+======================================================================
+tensor([-0.1836, -0.2447, -0.0512,  0.1101, -0.2620])
+tensor([-0.1826, -0.2451, -0.0510,  0.1104, -0.2617])
+torch.Size([128, 2048])
+torch.Size([128, 2048])
+Diff stats:
+  max: 0.07091712951660156
+  mean: 0.0008893357589840889
+  mse: 4.469751274882583e-06
+Top 5 most different elements (index: value):
+  179851: diff=0.07091712951660156, out=7.414667129516602, ref=7.34375
+  21382: diff=0.06355762481689453, out=-5.3114423751831055, ref=-5.375
+  210130: diff=0.059961795806884766, out=-7.690038204193115, ref=-7.75
+  11530: diff=0.05908966064453125, out=10.184089660644531, ref=10.125
+  176690: diff=0.05261039733886719, out=4.740110397338867, ref=4.6875
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T128_D2048
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        15.48%     486.211us       100.00%       3.142ms       3.142ms             1  
+            aten::silu        66.09%       2.076ms        66.09%       2.076ms     692.102us             3  
+             aten::mul        16.13%     506.835us        16.13%     506.835us     168.945us             3  
+           aten::slice         1.78%      56.043us         2.30%      72.251us      12.042us             6  
+      aten::as_strided         0.52%      16.208us         0.52%      16.208us       2.701us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.142ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T256_D768
+======================================================================
+tensor([-0.1372, -0.0743, -0.1620, -0.0605,  0.9656])
+tensor([-0.1367, -0.0742, -0.1611, -0.0608,  0.9648])
+torch.Size([256, 768])
+torch.Size([256, 768])
+Diff stats:
+  max: 0.04913330078125
+  mean: 0.0008873133920133114
+  mse: 4.3958548303635325e-06
+Top 5 most different elements (index: value):
+  9070: diff=0.04913330078125, out=-7.79913330078125, ref=-7.75
+  24359: diff=0.04716157913208008, out=-7.04658842086792, ref=-7.09375
+  32951: diff=0.04714775085449219, out=4.952852249145508, ref=5.0
+  154833: diff=0.0467376708984375, out=-5.7032623291015625, ref=-5.75
+  136168: diff=0.04372358322143555, out=5.8312764167785645, ref=5.875
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T256_D768
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        14.43%     258.000us       100.00%       1.787ms       1.787ms             1  
+            aten::silu        53.16%     950.142us        53.16%     950.142us     316.714us             3  
+             aten::mul        29.94%     535.121us        29.94%     535.121us     178.374us             3  
+           aten::slice         2.01%      35.962us         2.47%      44.214us       7.369us             6  
+      aten::as_strided         0.46%       8.252us         0.46%       8.252us       1.375us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.787ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T256_D1024
+======================================================================
+tensor([0.2989, 0.0356, 0.0747, 0.2235, 0.3924])
+tensor([0.2988, 0.0356, 0.0742, 0.2246, 0.3926])
+torch.Size([256, 1024])
+torch.Size([256, 1024])
+Diff stats:
+  max: 0.06802082061767578
+  mean: 0.0008889895398169756
+  mse: 4.431089109857567e-06
+Top 5 most different elements (index: value):
+  98566: diff=0.06802082061767578, out=9.005520820617676, ref=8.9375
+  105439: diff=0.058165550231933594, out=-7.816834449768066, ref=-7.875
+  101051: diff=0.050549983978271484, out=-4.8005499839782715, ref=-4.75
+  108736: diff=0.04906034469604492, out=4.794689655303955, ref=4.84375
+  113765: diff=0.048272132873535156, out=-5.857977867126465, ref=-5.90625
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T256_D1024
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        11.50%     358.973us       100.00%       3.121ms       3.121ms             1  
+            aten::silu        62.91%       1.963ms        62.91%       1.963ms     654.428us             3  
+             aten::mul        24.31%     758.572us        24.31%     758.572us     252.857us             3  
+           aten::slice         1.04%      32.498us         1.29%      40.124us       6.687us             6  
+      aten::as_strided         0.24%       7.626us         0.24%       7.626us       1.271us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.121ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T256_D2048
+======================================================================
+tensor([-0.1836, -0.2447, -0.0512,  0.1101, -0.2620])
+tensor([-0.1826, -0.2451, -0.0510,  0.1104, -0.2617])
+torch.Size([256, 2048])
+torch.Size([256, 2048])
+Diff stats:
+  max: 0.08395957946777344
+  mean: 0.0008889408782124519
+  mse: 4.476671620068373e-06
+Top 5 most different elements (index: value):
+  439480: diff=0.08395957946777344, out=9.978540420532227, ref=10.0625
+  179851: diff=0.07091712951660156, out=7.414667129516602, ref=7.34375
+  21382: diff=0.06355762481689453, out=-5.3114423751831055, ref=-5.375
+  210130: diff=0.059961795806884766, out=-7.690038204193115, ref=-7.75
+  11530: diff=0.05908966064453125, out=10.184089660644531, ref=10.125
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T256_D2048
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        15.81%       1.194ms       100.00%       7.551ms       7.551ms             1  
+            aten::silu        61.23%       4.623ms        61.23%       4.623ms       1.541ms             3  
+             aten::mul        21.01%       1.586ms        21.01%       1.586ms     528.681us             3  
+           aten::slice         1.55%     117.206us         1.96%     147.833us      24.639us             6  
+      aten::as_strided         0.41%      30.627us         0.41%      30.627us       5.104us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.551ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T512_D768
+======================================================================
+tensor([-0.1372, -0.0743, -0.1620, -0.0605,  0.9656])
+tensor([-0.1367, -0.0742, -0.1611, -0.0608,  0.9648])
+torch.Size([512, 768])
+torch.Size([512, 768])
+Diff stats:
+  max: 0.05687236785888672
+  mean: 0.0008884922135621309
+  mse: 4.399109002406476e-06
+Top 5 most different elements (index: value):
+  361157: diff=0.05687236785888672, out=-8.994372367858887, ref=-8.9375
+  324514: diff=0.04921436309814453, out=4.1382856369018555, ref=4.1875
+  9070: diff=0.04913330078125, out=-7.79913330078125, ref=-7.75
+  240463: diff=0.0480494499206543, out=-5.608200550079346, ref=-5.65625
+  24359: diff=0.04716157913208008, out=-7.04658842086792, ref=-7.09375
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T512_D768
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        20.25%     695.500us       100.00%       3.435ms       3.435ms             1  
+            aten::silu        54.52%       1.873ms        54.52%       1.873ms     624.208us             3  
+             aten::mul        23.71%     814.334us        23.71%     814.334us     271.445us             3  
+           aten::slice         1.25%      43.042us         1.53%      52.541us       8.757us             6  
+      aten::as_strided         0.28%       9.499us         0.28%       9.499us       1.583us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.435ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T512_D1024
+======================================================================
+tensor([0.2989, 0.0356, 0.0747, 0.2235, 0.3924])
+tensor([0.2988, 0.0356, 0.0742, 0.2246, 0.3926])
+torch.Size([512, 1024])
+torch.Size([512, 1024])
+Diff stats:
+  max: 0.06802082061767578
+  mean: 0.0008890957687981427
+  mse: 4.448749677976593e-06
+Top 5 most different elements (index: value):
+  98566: diff=0.06802082061767578, out=9.005520820617676, ref=8.9375
+  497777: diff=0.058727264404296875, out=8.433727264404297, ref=8.375
+  105439: diff=0.058165550231933594, out=-7.816834449768066, ref=-7.875
+  367857: diff=0.05556774139404297, out=7.631932258605957, ref=7.6875
+  341485: diff=0.052660465240478516, out=-4.5410895347595215, ref=-4.59375
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T512_D1024
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        39.22%       2.899ms       100.00%       7.392ms       7.392ms             1  
+            aten::silu        44.24%       3.270ms        44.24%       3.270ms       1.090ms             3  
+             aten::mul        15.74%       1.164ms        15.74%       1.164ms     387.903us             3  
+           aten::slice         0.63%      46.920us         0.80%      59.461us       9.910us             6  
+      aten::as_strided         0.17%      12.541us         0.17%      12.541us       2.090us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.392ms
+
+
+
+======================================================================
+CORRECTNESS FAILURE: torch_eager_darwin | cpu_T512_D2048
+======================================================================
+tensor([-0.1836, -0.2447, -0.0512,  0.1101, -0.2620])
+tensor([-0.1826, -0.2451, -0.0510,  0.1104, -0.2617])
+torch.Size([512, 2048])
+torch.Size([512, 2048])
+Diff stats:
+  max: 0.09098148345947266
+  mean: 0.0008892239420674741
+  mse: 4.500504473980982e-06
+Top 5 most different elements (index: value):
+  869075: diff=0.09098148345947266, out=-9.403481483459473, ref=-9.3125
+  439480: diff=0.08395957946777344, out=9.978540420532227, ref=10.0625
+  179851: diff=0.07091712951660156, out=7.414667129516602, ref=7.34375
+  21382: diff=0.06355762481689453, out=-5.3114423751831055, ref=-5.375
+  776526: diff=0.060305118560791016, out=-5.377194881439209, ref=-5.4375
+
+
+======================================================================
+PROFILE TRACE: torch_eager_darwin | cpu_T512_D2048
+======================================================================
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+    torch_eager_darwin        10.59%       2.513ms       100.00%      23.734ms      23.734ms             1  
+            aten::silu        55.91%      13.269ms        55.91%      13.269ms       4.423ms             3  
+             aten::mul        32.61%       7.739ms        32.61%       7.739ms       2.580ms             3  
+           aten::slice         0.74%     176.334us         0.90%     214.043us      35.674us             6  
+      aten::as_strided         0.16%      37.709us         0.16%      37.709us       6.285us             6  
+----------------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 23.734ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager_darwin       cpu_T128_D1024         0.18  False
+torch_eager_darwin       cpu_T128_D2048         0.38  False
+torch_eager_darwin       cpu_T128_D768          0.14  False
+torch_eager_darwin       cpu_T256_D1024         0.40  False
+torch_eager_darwin       cpu_T256_D2048         0.98  False
+torch_eager_darwin       cpu_T256_D768          0.33  False
+torch_eager_darwin       cpu_T512_D1024         1.63  False
+torch_eager_darwin       cpu_T512_D2048         3.37  False
+torch_eager_darwin       cpu_T512_D768          0.87  False
+
+
+
▶ UV Install Logs
+ +
+
Matplotlib is building the font cache; this may take a moment.
+
+

Artifacts:

+activation.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/activation/index.html b/activation/index.html index b48c1ba4dfdc0cea36f7f63dcf04dfad1700dc82..982d850b56851f704dac0361c91b21ca13f56b51 100644 --- a/activation/index.html +++ b/activation/index.html @@ -83,7 +83,7 @@

Index of /activation

\ No newline at end of file diff --git a/activation/results_darwin/artifacts/combine/latency.svg b/activation/results_darwin/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..2d55917620ee25d590cfd72b456a8634053511fd --- /dev/null +++ b/activation/results_darwin/artifacts/combine/latency.svg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5289a42b37402c0614d1b53534a0180586b4d0b88305535d38de44de6b50881 +size 947 diff --git a/activation/results_darwin/cells/combine.py b/activation/results_darwin/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..ba7c3fbee8393914c94ea9d4e7535c297608dc9e --- /dev/null +++ b/activation/results_darwin/cells/combine.py @@ -0,0 +1,25 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib" +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + "PyTorch SwiGLU (macOS)": "UVNOTE_FILE_TORCH_SWIGLU_DARWIN_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="activation.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/activation/results_darwin/combined_results.html b/activation/results_darwin/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..73e55de000d1656205bcfb8917db23d970c0baf5 --- /dev/null +++ b/activation/results_darwin/combined_results.html @@ -0,0 +1,4068 @@ + + + + + + SwiGLU Activation Benchmark - Combined Results (macOS) + + + + + + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Darwin arm64 | macOS-15.7.2-arm64-arm-64bit +
+
+ +
+

SwiGLU Activation Benchmarks - Aggregated Results (macOS)

+

This document combines benchmark results from SwiGLU activation implementations on macOS.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-12-19T22:44:10.356853 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 20.76s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ PyTorch SwiGLU (macOS)        : /Users/runner/work/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/f249c063d33d6e52c05f603fc062dc2bd9327140dcd61028ab7d7192fd88a5c6
+
+  ✓ Found PyTorch SwiGLU (macOS)
+     Path: /Users/runner/work/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/f249c063d33d6e52c05f603fc062dc2bd9327140dcd61028ab7d7192fd88a5c6/activation.jsonl
+
+======================================================================
+Summary: 1 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+torch_eager_darwin       cpu_T128_D1024         0.18  False
+torch_eager_darwin       cpu_T128_D2048         0.38  False
+torch_eager_darwin       cpu_T128_D768          0.14  False
+torch_eager_darwin       cpu_T256_D1024         0.40  False
+torch_eager_darwin       cpu_T256_D2048         0.98  False
+torch_eager_darwin       cpu_T256_D768          0.33  False
+torch_eager_darwin       cpu_T512_D1024         1.63  False
+torch_eager_darwin       cpu_T512_D2048         3.37  False
+torch_eager_darwin       cpu_T512_D768          0.87  False
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 9 records
+No valid records found
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 1
+
+Implementations included:
+  ✓ PyTorch SwiGLU (macOS)
+
+
+
▶ UV Install Logs
+ +
+
Matplotlib is building the font cache; this may take a moment.
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-12-19T22:44:10.356853 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/activation/results_darwin/index.html b/activation/results_darwin/index.html new file mode 100644 index 0000000000000000000000000000000000000000..c66abdcf29819a436c203a7c075b590c0e34193a --- /dev/null +++ b/activation/results_darwin/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /activation/results_darwin + + + +
+ ← back +
+

Index of /activation/results_darwin

+ + + \ No newline at end of file diff --git a/activation/results_linux/artifacts/combine/latency.svg b/activation/results_linux/artifacts/combine/latency.svg index 3dff0c27669b205e6e273a1d8d1fd8d41b313b9d..8ab4c9b8f98a732495797922df1100a00733f05e 100644 --- a/activation/results_linux/artifacts/combine/latency.svg +++ b/activation/results_linux/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0ebb1563a7889f083e3a946d975f77f909986c659d9bdfad99579689fd355e5 -size 21476 +oid sha256:2c3c56353a22a5acffe8087c058e833d475a7a6c5bc584cf64d87b43035f7eef +size 20695 diff --git a/activation/results_linux/combined_results.html b/activation/results_linux/combined_results.html index 5c7f459902b07ca189d757cbf948f35e262a3f91..1eeb0fe9bfcbb150fd9c992ff1aecd2b645eb890 100644 --- a/activation/results_linux/combined_results.html +++ b/activation/results_linux/combined_results.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { - 2025-12-19T19:55:34.708013 + 2025-12-19T23:02:36.234026 image/svg+xml @@ -4038,96 +4038,83 @@ body[data-tool="eraser"] .main-content { - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 - - - - - - - - - - - - - 0.055 + 0.050 @@ -4135,37 +4122,37 @@ body[data-tool="eraser"] .main-content { - + - - - - - - - - + + + + + + + + - + - - - - - - - - - + + + + + + + + + @@ -4180,14 +4167,14 @@ body[data-tool="eraser"] .main-content { - + Attention Implementation Latency - + @@ -4196,7 +4183,7 @@ body[data-tool="eraser"] .main-content { hf_kernels_swiglu - + @@ -4223,7 +4210,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.50s +Cell: combine | 4.66s | Raw @@ -4319,7 +4306,7 @@ hf_kernels_swiglu cuda_T256_D768 0.03 True hf_kernels_swiglu cuda_T512_D1024 0.03 True hf_kernels_swiglu cuda_T512_D2048 0.03 True hf_kernels_swiglu cuda_T512_D768 0.03 True -torch_eager cuda_T128_D1024 0.06 True +torch_eager cuda_T128_D1024 0.05 True torch_eager cuda_T128_D2048 0.05 True torch_eager cuda_T128_D768 0.04 True torch_eager cuda_T256_D1024 0.05 True @@ -4347,7 +4334,7 @@ Implementations included:
▶ UV Install Logs
@@ -4360,7 +4347,7 @@ Installed 37 packages in 206ms - 2025-12-19T19:55:34.708013 + 2025-12-19T23:02:36.234026 image/svg+xml @@ -4509,96 +4496,83 @@ Installed 37 packages in 206ms - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 - - - - - - - - - - - - - 0.055 + 0.050 @@ -4606,37 +4580,37 @@ Installed 37 packages in 206ms - + - - - - - - - - + + + + + + + + - + - - - - - - - - - + + + + + + + + + @@ -4651,14 +4625,14 @@ Installed 37 packages in 206ms - + Attention Implementation Latency - + @@ -4667,7 +4641,7 @@ Installed 37 packages in 206ms hf_kernels_swiglu - + diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl index 03b673ac72ba8387dcc14a2488a469b2fe2efd5c..e99d551139e4d138d4ac8e50f5806b594cc96569 100644 --- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04411100007928326, "p50": 0.04549100003714557, "p90": 0.046580999878642615, "mean": 0.045672999976886786, "iqr": 0.0013399999261309858, "raw_times": [0.04524099995251163, 0.04549100003714557, 0.04694099993685086, 0.046580999878642615, 0.04411100007928326], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05584099994848657, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05282100005388202, "p50": 0.05301199985296989, "p90": 0.053511999794864096, "mean": 0.053615399929185514, "iqr": 0.0006809998467360856, "raw_times": [0.05282100005388202, 0.05301199985296989, 0.05283099994812801, 0.055900999996083556, 0.053511999794864096], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05840199992235284, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05104100000608014, "p50": 0.05199099996389123, "p90": 0.052391000053830794, "mean": 0.05195319999984349, "iqr": 0.0011390000054234406, "raw_times": [0.05125200004840735, 0.05309099992700794, 0.05199099996389123, 0.05104100000608014, 0.052391000053830794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05686200006493891, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049951999926634016, "p50": 0.052720999974553706, "p90": 0.05275099988466536, "mean": 0.05206719993111619, "iqr": 0.001500000053056283, "raw_times": [0.049951999926634016, 0.052720999974553706, 0.0536610000381188, 0.05275099988466536, 0.05125099983160908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056871999959184905, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04956099996888952, "p50": 0.05162100001143699, "p90": 0.05194100003791391, "mean": 0.05130520003149286, "iqr": 0.0009590000900061568, "raw_times": [0.05194100003791391, 0.052421000191316125, 0.05098199994790775, 0.04956099996888952, 0.05162100001143699], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05496100015989214, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05030099987379799, "p50": 0.05167099993741431, "p90": 0.05240099994807679, "mean": 0.05315919993336138, "iqr": 0.00083000008999079, "raw_times": [0.05030099987379799, 0.05985200004943181, 0.05240099994807679, 0.051570999858086, 0.05167099993741431], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055181000107040745, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04961100012224051, "p50": 0.05098099995848315, "p90": 0.05157100008545967, "mean": 0.05098120004731754, "iqr": 0.0007500000265281415, "raw_times": [0.04961100012224051, 0.051922000011472846, 0.05082100005893153, 0.05098099995848315, 0.05157100008545967], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053892000096311676, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049450999995315215, "p50": 0.049821000175143126, "p90": 0.05057099997429759, "mean": 0.050117000046157045, "iqr": 0.0009599998520570807, "raw_times": [0.049450999995315215, 0.049821000175143126, 0.05113099996378878, 0.05057099997429759, 0.04961100012224051], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0559909999537922, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04905100013274932, "p50": 0.05060100011178292, "p90": 0.05113099996378878, "mean": 0.051737200055868016, "iqr": 0.0005899998996028444, "raw_times": [0.057362000006833114, 0.05113099996378878, 0.05060100011178292, 0.05054100006418594, 0.04905100013274932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05664199989041663, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05021099991608935, "p50": 0.051011000095968484, "p90": 0.05146100011188537, "mean": 0.05368720007936645, "iqr": 0.0007099999947968172, "raw_times": [0.05075100011708855, 0.05021099991608935, 0.06500200015580049, 0.051011000095968484, 0.05146100011188537], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05477099989548151, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04905100013274932, "p50": 0.0506710000536259, "p90": 0.05074099999546888, "mean": 0.05034520004301157, "iqr": 0.0009190000582748326, "raw_times": [0.04905100013274932, 0.04982199993719405, 0.05074099999546888, 0.0506710000536259, 0.05144100009601971], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05602199985332845, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04956099996888952, "p50": 0.05081099993731186, "p90": 0.05169099995327997, "mean": 0.05088699999760138, "iqr": 0.001219999830937013, "raw_times": [0.04956099996888952, 0.05081099993731186, 0.05169099995327997, 0.05047100012234296, 0.051901000006182585], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09174200022243895, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04988099999536644, "p50": 0.05099100008010282, "p90": 0.05105099990032613, "mean": 0.052221000032659504, "iqr": 0.0010199998996540671, "raw_times": [0.050031000000672066, 0.05105099990032613, 0.05915100018683006, 0.04988099999536644, 0.05099100008010282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05531200008590531, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05021099991608935, "p50": 0.051081000037811464, "p90": 0.05169199994270457, "mean": 0.0512771999638062, "iqr": 0.0008909998996387003, "raw_times": [0.05169199994270457, 0.05080100004306587, 0.051081000037811464, 0.05021099991608935, 0.05260099987935973], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05523100003301806, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05031099999541766, "p50": 0.05089100000077451, "p90": 0.05121100002725143, "mean": 0.05099300005895202, "iqr": 0.0007899998308857903, "raw_times": [0.05031099999541766, 0.05121100002725143, 0.05213100007495086, 0.05042100019636564, 0.05089100000077451], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05383200004871469, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04998199983674567, "p50": 0.05088100010652852, "p90": 0.05478100001710118, "mean": 0.05373740004870342, "iqr": 0.004679999847212457, "raw_times": [0.04998199983674567, 0.05478100001710118, 0.05010100016988872, 0.05088100010652852, 0.06294200011325302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055170999985421076, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0490120000904426, "p50": 0.050441999974282226, "p90": 0.0519509999321599, "mean": 0.05078939998384158, "iqr": 0.0017099998785852222, "raw_times": [0.0490120000904426, 0.05230099986874848, 0.050441999974282226, 0.05024100005357468, 0.0519509999321599], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055800999916755245, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04950099992129253, "p50": 0.050521000048320275, "p90": 0.05074099999546888, "mean": 0.05043319997639628, "iqr": 0.0008590000106778461, "raw_times": [0.04988199998479104, 0.05152099993210868, 0.050521000048320275, 0.05074099999546888, 0.04950099992129253], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054772000112279784, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049130999968838296, "p50": 0.05138200003784732, "p90": 0.05160199998499593, "mean": 0.05141159999766387, "iqr": 0.0003800000740739051, "raw_times": [0.049130999968838296, 0.05372100008571579, 0.05122199991092202, 0.05160199998499593, 0.05138200003784732], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05565100013882329, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05034100013290299, "p50": 0.05126099995322875, "p90": 0.051630999905682984, "mean": 0.051170999995520106, "iqr": 0.0007999999525054591, "raw_times": [0.05126099995322875, 0.05034100013290299, 0.051630999905682984, 0.05179100003260828, 0.050830999953177525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05449200011753419, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05069099984211789, "p50": 0.05091200000606477, "p90": 0.05127099984747474, "mean": 0.07049399991956307, "iqr": 0.00038899997889529914, "raw_times": [0.05069099984211789, 0.1487140000335785, 0.05091200000606477, 0.05088199986857944, 0.05127099984747474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054521000038221246, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049631000138106174, "p50": 0.05295099981594831, "p90": 0.05334100001164188, "mean": 0.057009199963431456, "iqr": 0.0027099999897473026, "raw_times": [0.049631000138106174, 0.05295099981594831, 0.05334100001164188, 0.05063100002189458, 0.07849199982956634], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05502199996954005, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0506710000536259, "p50": 0.05091100001664017, "p90": 0.051161999863325036, "mean": 0.0510071999997308, "iqr": 0.00031099989428184927, "raw_times": [0.05091100001664017, 0.0506710000536259, 0.05085099996904319, 0.05144100009601971, 0.051161999863325036], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057221000133722555, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04936100003760657, "p50": 0.050112000053559314, "p90": 0.05135099991093739, "mean": 0.050513199994384195, "iqr": 0.0015199998415482696, "raw_times": [0.04936100003760657, 0.05135099991093739, 0.04983100006938912, 0.05191099990042858, 0.050112000053559314], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05625199992209673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04374100012682902, "p50": 0.046281000095405034, "p90": 0.04699100009020185, "mean": 0.04588520009747299, "iqr": 0.0018789999103319133, "raw_times": [0.0473009999950591, 0.04699100009020185, 0.04511200017986994, 0.046281000095405034, 0.04374100012682902], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054351000017049955, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05076099978396087, "p50": 0.05160099999557133, "p90": 0.0517210000907653, "mean": 0.05158899998605193, "iqr": 0.00034000004234258085, "raw_times": [0.05076099978396087, 0.052481000011539436, 0.05138100004842272, 0.0517210000907653, 0.05160099999557133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06265199999688775, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984099996363511, "p50": 0.05232199987403874, "p90": 0.05389100010688708, "mean": 0.05356520000532328, "iqr": 0.0016999999843392288, "raw_times": [0.04984099996363511, 0.05389100010688708, 0.05232199987403874, 0.05958099995950761, 0.05219100012254785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472199995892879, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050031999990096665, "p50": 0.0509510000483715, "p90": 0.0512909998633404, "mean": 0.05097519997434574, "iqr": 0.0009099996987060877, "raw_times": [0.050381000164634315, 0.0512909998633404, 0.05222099980528583, 0.0509510000483715, 0.050031999990096665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05585200005953084, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048549999974056846, "p50": 0.04953099983140419, "p90": 0.04999000020688982, "mean": 0.04990460001863539, "iqr": 0.0006790000952605624, "raw_times": [0.04953099983140419, 0.052140999969196855, 0.04999000020688982, 0.049311000111629255, 0.048549999974056846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054900999884921475, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04891100002168969, "p50": 0.04958099998475518, "p90": 0.05115099997965444, "mean": 0.05000500000278407, "iqr": 0.002170000016121776, "raw_times": [0.04891100002168969, 0.05140100006428838, 0.04958099998475518, 0.05115099997965444, 0.04898099996353267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05376100011744711, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04850100003750413, "p50": 0.04982199993719405, "p90": 0.04984099996363511, "mean": 0.04968119997101894, "iqr": 0.0003499999365885742, "raw_times": [0.04850100003750413, 0.04984099996363511, 0.04949100002704654, 0.050750999889714876, 0.04982199993719405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054501000022355583, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049300999990009586, "p50": 0.04983100006938912, "p90": 0.050170999884358025, "mean": 0.05779919997621619, "iqr": 0.0004900000476482091, "raw_times": [0.09001200010061439, 0.04983100006938912, 0.049680999836709816, 0.050170999884358025, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052752000101463636, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047761000132595655, "p50": 0.04934100002174091, "p90": 0.049350999915986904, "mean": 0.0488690000565839, "iqr": 0.0008099998467514524, "raw_times": [0.04854100006923545, 0.04935100014336058, 0.049350999915986904, 0.04934100002174091, 0.047761000132595655], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052481000011539436, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048450999884153134, "p50": 0.04965099992659816, "p90": 0.050621000127648585, "mean": 0.05171900002096663, "iqr": 0.0015900000107649248, "raw_times": [0.048450999884153134, 0.04903100011688366, 0.04965099992659816, 0.050621000127648585, 0.06084100004954962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053670999932364793, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047580999989804695, "p50": 0.0487410000005184, "p90": 0.050201000021843356, "mean": 0.04913100001431303, "iqr": 0.002320000021427404, "raw_times": [0.047580999989804695, 0.050201000021843356, 0.051251000058982754, 0.04788100000041595, 0.0487410000005184], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054761000001235516, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.049540999953023857, "p90": 0.04967099994246382, "mean": 0.049752999939300935, "iqr": 0.0003000000106112566, "raw_times": [0.049370999931852566, 0.049540999953023857, 0.049270999852524255, 0.05091100001664017, 0.04967099994246382], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053771000011693104, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04857099997934711, "p50": 0.04948099990542687, "p90": 0.049690999958329485, "mean": 0.04942899995512562, "iqr": 0.0004900000476482091, "raw_times": [0.04857099997934711, 0.04948099990542687, 0.049690999958329485, 0.049200999910681276, 0.050201000021843356], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05436199990072055, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048160999995161546, "p50": 0.049621000016486505, "p90": 0.05022099981033534, "mean": 0.04960719993505336, "iqr": 0.0007689998255955288, "raw_times": [0.048160999995161546, 0.049621000016486505, 0.05022099981033534, 0.049451999984739814, 0.050580999868543586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053051000122650294, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04980199992132839, "p50": 0.05055099995843193, "p90": 0.050551000185805606, "mean": 0.05498319997059298, "iqr": 0.00047000025915622246, "raw_times": [0.07393099986074958, 0.05008099992664938, 0.05055099995843193, 0.050551000185805606, 0.04980199992132839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053621999995812075, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048960999947667005, "p50": 0.05131100010657974, "p90": 0.052661000154330395, "mean": 0.051906999988204916, "iqr": 0.0029200002700235927, "raw_times": [0.048960999947667005, 0.052661000154330395, 0.056860999848140636, 0.0497409998843068, 0.05131100010657974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053932000128043, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04804100012734125, "p50": 0.04966200003764243, "p90": 0.05004099989491806, "mean": 0.04977120001967705, "iqr": 0.0007399999049084727, "raw_times": [0.04804100012734125, 0.05181100004847394, 0.04966200003764243, 0.05004099989491806, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05391199988480366, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047730999995110324, "p50": 0.04978099991603813, "p90": 0.049860999979500775, "mean": 0.04935500001010951, "iqr": 0.0009599998520570807, "raw_times": [0.047730999995110324, 0.04978099991603813, 0.05050100003245461, 0.048901000127443695, 0.049860999979500775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0537809999059391, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.05024100005357468, "p90": 0.05084099984742352, "mean": 0.050164999993285164, "iqr": 0.0012699997569143306, "raw_times": [0.049270999852524255, 0.05024100005357468, 0.05084099984742352, 0.05090100012239418, 0.04957100009050919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05270199994811264, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04698099996858218, "p50": 0.049690999958329485, "p90": 0.04984100019100879, "mean": 0.049111000043922104, "iqr": 0.00083000008999079, "raw_times": [0.04698099996858218, 0.04984100019100879, 0.050031000000672066, 0.049011000101018, 0.049690999958329485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05244099997980811, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04767199993693794, "p50": 0.049621000016486505, "p90": 0.04999099996894074, "mean": 0.049529399984749034, "iqr": 0.0007789999472151976, "raw_times": [0.04767199993693794, 0.049621000016486505, 0.05115099997965444, 0.04999099996894074, 0.049212000021725544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05385099984778208, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04878100003224972, "p50": 0.049651000153971836, "p90": 0.050100999942515045, "mean": 0.04970100003447442, "iqr": 0.0005699998837371822, "raw_times": [0.04878100003224972, 0.050440999984857626, 0.049651000153971836, 0.04953100005877786, 0.050100999942515045], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054691000059392536, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04801099998985592, "p50": 0.049041000011129654, "p90": 0.04916099987894995, "mean": 0.04916500001854729, "iqr": 0.00038999974094622303, "raw_times": [0.04801099998985592, 0.050841000074797194, 0.04916099987894995, 0.04877100013800373, 0.049041000011129654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05514100007530942, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04843099986828747, "p50": 0.04957099986313551, "p90": 0.04984099996363511, "mean": 0.04942899995512562, "iqr": 0.00046999980440887157, "raw_times": [0.04843099986828747, 0.04937100015922624, 0.049930999921343755, 0.04957099986313551, 0.04984099996363511], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05356199994821509, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html index 35c006a62bb9c82bd81aa6ec9158c6fbf639bad0..f6d2333cbb4e5575dea9faa33e0d7e7b51a750c8 100644 --- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html +++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.25s +Cell: nv | 0.24s | Raw @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:54:47 2025       
+
Fri Dec 19 22:48:25 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   37C    P0             82W /  350W |       0MiB /  46068MiB |     14%      Default |
+| N/A   28C    P0             90W /  350W |       0MiB /  46068MiB |     10%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 6.16s
+Cell: benchmark | 6.59s
  | 
 
 Raw
@@ -3992,19 +3992,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.993us      3625.73%     144.993us     144.993us             1  
-                               hf_kernels_causal_conv1d         6.46%     138.804us        99.34%       2.133ms       2.133ms       0.000us         0.00%       5.407us       5.407us             1  
-                                         CausalConv1dFn         4.85%     104.092us        92.88%       1.994ms     664.706us       0.000us         0.00%       5.407us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.17%      25.081us        85.08%       1.827ms     608.915us       3.999us       100.00%       5.407us       1.802us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.999us       100.00%       3.999us       1.333us             3  
-                                Activity Buffer Request        81.62%       1.752ms        81.62%       1.752ms       1.752ms       1.408us        35.21%       1.408us       1.408us             1  
-                                       aten::empty_like         0.88%      18.799us         2.95%      63.281us      21.094us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.07%      44.482us         2.07%      44.482us      14.827us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.29%      49.201us         2.29%      49.201us      16.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.66%      14.170us         0.66%      14.170us      14.170us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     138.175us      3373.41%     138.175us     138.175us             1  
+                               hf_kernels_causal_conv1d         9.76%     215.575us        99.34%       2.195ms       2.195ms       0.000us         0.00%       5.536us       5.536us             1  
+                                         CausalConv1dFn         4.55%     100.523us        89.58%       1.980ms     659.841us       0.000us         0.00%       5.536us       1.845us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.12%      24.839us        82.43%       1.821ms     607.150us       4.096us       100.00%       5.536us       1.845us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.096us       100.00%       4.096us       1.365us             3  
+                                Activity Buffer Request        79.20%       1.750ms        79.20%       1.750ms       1.750ms       1.440us        35.16%       1.440us       1.440us             1  
+                                       aten::empty_like         0.80%      17.780us         2.60%      57.551us      19.184us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.80%      39.771us         1.80%      39.771us      13.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.11%      46.542us         2.11%      46.542us      15.514us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.66%      14.640us         0.66%      14.640us      14.640us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.147ms
-Self CUDA time total: 3.999us
+Self CPU time total: 2.210ms
+Self CUDA time total: 4.096us
 
 
 
@@ -4014,18 +4014,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.991us      3231.60%     120.991us     120.991us             1  
-                               hf_kernels_causal_conv1d         3.92%      79.473us        99.73%       2.024ms       2.024ms       0.000us         0.00%       4.992us       4.992us             1  
-                                         CausalConv1dFn         3.49%      70.790us        95.81%       1.944ms     648.019us       0.000us         0.00%       4.992us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      24.702us        90.78%       1.842ms     614.002us       3.744us       100.00%       4.992us       1.664us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.799us      3119.63%     116.799us     116.799us             1  
+                               hf_kernels_causal_conv1d         4.07%      82.232us        99.70%       2.017ms       2.017ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.50%      70.722us        95.64%       1.934ms     644.801us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.19%      23.990us        90.71%       1.835ms     611.576us       3.744us       100.00%       4.992us       1.664us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
-                                Activity Buffer Request        88.06%       1.787ms        88.06%       1.787ms       1.787ms       1.248us        33.33%       1.248us       1.248us             1  
-                                       aten::empty_like         0.40%       8.072us         1.54%      31.262us      10.421us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.14%      23.190us         1.14%      23.190us       7.730us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.51%      30.580us         1.51%      30.580us      10.193us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.530us         0.27%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        88.04%       1.781ms        88.04%       1.781ms       1.781ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.35%       7.172us         1.43%      28.951us       9.650us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.08%      21.779us         1.08%      21.779us       7.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.48%      29.931us         1.48%      29.931us       9.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       6.050us         0.30%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.029ms
+Self CPU time total: 2.023ms
 Self CUDA time total: 3.744us
 
 
@@ -4036,19 +4036,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.553us      3165.29%     119.553us     119.553us             1  
-                               hf_kernels_causal_conv1d         5.28%     102.383us        99.71%       1.932ms       1.932ms       0.000us         0.00%       5.026us       5.026us             1  
-                                         CausalConv1dFn         3.60%      69.861us        94.43%       1.830ms     610.001us       0.000us         0.00%       5.026us       1.675us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.29%      25.081us        89.25%       1.730ms     576.561us       3.777us       100.00%       5.026us       1.675us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
-                                Activity Buffer Request        86.29%       1.672ms        86.29%       1.672ms       1.672ms       1.249us        33.07%       1.249us       1.249us             1  
-                                       aten::empty_like         0.46%       8.829us         1.57%      30.461us      10.154us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.12%      21.632us         1.12%      21.632us       7.211us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.67%      32.330us         1.67%      32.330us      10.777us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.571us         0.29%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.407us      3135.87%     117.407us     117.407us             1  
+                               hf_kernels_causal_conv1d         7.37%     145.913us        99.70%       1.974ms       1.974ms       0.000us         0.00%       4.992us       4.992us             1  
+                                         CausalConv1dFn         3.62%      71.732us        92.33%       1.828ms     609.363us       0.000us         0.00%       4.992us       1.664us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.15%      22.761us        87.20%       1.727ms     575.503us       3.744us       100.00%       4.992us       1.664us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
+                                Activity Buffer Request        84.49%       1.673ms        84.49%       1.673ms       1.673ms       1.248us        33.33%       1.248us       1.248us             1  
+                                       aten::empty_like         0.40%       7.929us         1.51%      29.850us       9.950us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.11%      21.921us         1.11%      21.921us       7.307us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.55%      30.741us         1.55%      30.741us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       6.010us         0.30%       6.010us       6.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.938ms
-Self CUDA time total: 3.777us
+Self CPU time total: 1.980ms
+Self CUDA time total: 3.744us
 
 
 
@@ -4058,18 +4058,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.951us      3441.50%     129.951us     129.951us             1  
-                               hf_kernels_causal_conv1d         4.54%     106.623us        99.77%       2.341ms       2.341ms       0.000us         0.00%       5.024us       5.024us             1  
-                                         CausalConv1dFn         3.24%      75.911us        95.23%       2.234ms     744.741us       0.000us         0.00%       5.024us       1.675us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         0.97%      22.670us        90.70%       2.128ms     709.344us       3.776us       100.00%       5.024us       1.675us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.761us      3330.53%     125.761us     125.761us             1  
+                               hf_kernels_causal_conv1d         5.00%     114.852us        99.75%       2.290ms       2.290ms       0.000us         0.00%       5.024us       5.024us             1  
+                                         CausalConv1dFn         3.16%      72.552us        94.75%       2.175ms     724.966us       0.000us         0.00%       5.024us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         0.94%      21.579us        90.35%       2.074ms     691.325us       3.776us       100.00%       5.024us       1.675us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
-                                Activity Buffer Request        79.32%       1.861ms        79.32%       1.861ms       1.861ms       1.248us        33.05%       1.248us       1.248us             1  
-                                       aten::empty_like         0.36%       8.382us         1.29%      30.281us      10.094us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.93%      21.899us         0.93%      21.899us       7.300us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        10.41%     244.307us        10.41%     244.307us      81.436us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.280us         0.23%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        78.71%       1.807ms        78.71%       1.807ms       1.807ms       1.248us        33.05%       1.248us       1.248us             1  
+                                       aten::empty_like         0.34%       7.750us         1.24%      28.370us       9.457us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.90%      20.620us         0.90%      20.620us       6.873us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.71%     245.786us        10.71%     245.786us      81.929us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.631us         0.25%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.346ms
+Self CPU time total: 2.295ms
 Self CUDA time total: 3.776us
 
 
@@ -4080,19 +4080,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.920us      2724.83%     129.920us     129.920us             1  
-                               hf_kernels_causal_conv1d         4.79%     104.612us        99.77%       2.179ms       2.179ms       0.000us         0.00%       6.368us       6.368us             1  
-                                         CausalConv1dFn         3.53%      77.212us        94.98%       2.075ms     691.584us       0.000us         0.00%       6.368us       2.123us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.05%      22.870us        90.05%       1.967ms     655.679us       4.768us       100.00%       6.368us       2.123us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
-                                Activity Buffer Request        78.82%       1.722ms        78.82%       1.722ms       1.722ms       1.600us        33.56%       1.600us       1.600us             1  
-                                       aten::empty_like         0.38%       8.362us         1.40%      30.502us      10.167us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.01%      22.140us         1.01%      22.140us       7.380us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        10.18%     222.375us        10.18%     222.375us      74.125us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.100us         0.23%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.607us      2512.65%     120.607us     120.607us             1  
+                               hf_kernels_causal_conv1d         4.85%     103.512us        99.74%       2.128ms       2.128ms       0.000us         0.00%       6.432us       6.432us             1  
+                                         CausalConv1dFn         3.24%      69.180us        94.89%       2.024ms     674.755us       0.000us         0.00%       6.432us       2.144us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.08%      23.061us        90.26%       1.925ms     641.807us       4.800us       100.00%       6.432us       2.144us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
+                                Activity Buffer Request        78.33%       1.671ms        78.33%       1.671ms       1.671ms       1.632us        34.00%       1.632us       1.632us             1  
+                                       aten::empty_like         0.38%       8.041us         1.39%      29.662us       9.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.01%      21.621us         1.01%      21.621us       7.207us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.85%     231.515us        10.85%     231.515us      77.172us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.440us         0.26%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.184ms
-Self CUDA time total: 4.768us
+Self CPU time total: 2.133ms
+Self CUDA time total: 4.800us
 
 
 
@@ -4102,19 +4102,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.830us      2433.96%     116.830us     116.830us             1  
-                               hf_kernels_causal_conv1d        11.09%      77.122us        99.27%     690.457us     690.457us       0.000us         0.00%       6.432us       6.432us             1  
-                                         CausalConv1dFn        10.04%      69.842us        88.18%     613.335us     204.445us       0.000us         0.00%       6.432us       2.144us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.34%      23.210us        74.09%     515.312us     171.771us       4.800us       100.00%       6.432us       2.144us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
-                                Activity Buffer Request        40.97%     284.977us        40.97%     284.977us     284.977us       1.632us        34.00%       1.632us       1.632us             1  
-                                       aten::empty_like         1.03%       7.161us         4.05%      28.181us       9.394us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.02%      21.020us         3.02%      21.020us       7.007us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        29.78%     207.125us        29.78%     207.125us      69.042us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.73%       5.090us         0.73%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.085us      2372.25%     113.085us     113.085us             1  
+                               hf_kernels_causal_conv1d        10.68%      79.331us        99.24%     737.226us     737.226us       0.000us         0.00%       6.367us       6.367us             1  
+                                         CausalConv1dFn         8.90%      66.092us        88.56%     657.895us     219.298us       0.000us         0.00%       6.367us       2.122us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.16%      23.443us        75.78%     562.993us     187.664us       4.767us       100.00%       6.367us       2.122us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.767us       100.00%       4.767us       1.589us             3  
+                                Activity Buffer Request        41.95%     311.656us        41.95%     311.656us     311.656us       1.600us        33.56%       1.600us       1.600us             1  
+                                       aten::empty_like         0.97%       7.240us         3.88%      28.810us       9.603us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.90%      21.570us         2.90%      21.570us       7.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.68%     227.894us        30.68%     227.894us      75.965us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.76%       5.680us         0.76%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 695.547us
-Self CUDA time total: 4.800us
+Self CPU time total: 742.906us
+Self CUDA time total: 4.767us
 
 
 
@@ -4124,19 +4124,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.990us      1138.94%     120.990us     120.990us             1  
-                               hf_kernels_causal_conv1d         8.25%      76.440us        99.44%     921.732us     921.732us       0.000us         0.00%      14.206us      14.206us             1  
-                                         CausalConv1dFn         7.43%      68.843us        91.20%     845.292us     281.764us       0.000us         0.00%      14.206us       4.735us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.57%      23.801us        80.53%     746.458us     248.819us      10.623us       100.00%      14.206us       4.735us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.623us       100.00%      10.623us       3.541us             3  
-                                Activity Buffer Request        58.01%     537.643us        58.01%     537.643us     537.643us       3.583us        33.73%       3.583us       3.583us             1  
-                                       aten::empty_like         0.88%       8.201us         3.24%      29.991us       9.997us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.35%      21.790us         2.35%      21.790us       7.263us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.96%     185.014us        19.96%     185.014us      61.671us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.56%       5.149us         0.56%       5.149us       5.149us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.783us      1121.44%     118.783us     118.783us             1  
+                               hf_kernels_causal_conv1d         4.59%     101.592us        99.77%       2.207ms       2.207ms       0.000us         0.00%      14.144us      14.144us             1  
+                                         CausalConv1dFn         3.12%      68.981us        95.17%       2.105ms     701.765us       0.000us         0.00%      14.144us       4.715us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.00%      22.142us        90.78%       2.008ms     669.378us      10.592us       100.00%      14.144us       4.715us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us       100.00%      10.592us       3.531us             3  
+                                Activity Buffer Request        79.96%       1.769ms        79.96%       1.769ms       1.769ms       3.552us        33.53%       3.552us       3.552us             1  
+                                       aten::empty_like         0.35%       7.660us         1.27%      28.180us       9.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.93%      20.520us         0.93%      20.520us       6.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.83%     217.354us         9.83%     217.354us      72.451us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       5.140us         0.23%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 926.881us
-Self CUDA time total: 10.623us
+Self CPU time total: 2.212ms
+Self CUDA time total: 10.592us
 
 
 
@@ -4146,19 +4146,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.338us      1124.43%     122.338us     122.338us             1  
-                               hf_kernels_causal_conv1d         8.49%      80.052us        99.43%     937.274us     937.274us       0.000us         0.00%      14.560us      14.560us             1  
-                                         CausalConv1dFn         7.62%      71.802us        90.94%     857.222us     285.741us       0.000us         0.00%      14.560us       4.853us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.57%      24.180us        80.29%     756.859us     252.286us      10.880us       100.00%      14.560us       4.853us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.880us       100.00%      10.880us       3.627us             3  
-                                Activity Buffer Request        59.85%     564.194us        59.85%     564.194us     564.194us       3.680us        33.82%       3.680us       3.680us             1  
-                                       aten::empty_like         0.81%       7.670us         3.03%      28.561us       9.520us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.22%      20.891us         2.22%      20.891us       6.964us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        17.87%     168.485us        17.87%     168.485us      56.162us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.57%       5.361us         0.57%       5.361us       5.361us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.020us      1041.95%     113.020us     113.020us             1  
+                               hf_kernels_causal_conv1d        12.31%      79.151us        99.13%     637.563us     637.563us       0.000us         0.00%      14.495us      14.495us             1  
+                                         CausalConv1dFn        10.43%      67.063us        86.83%     558.412us     186.137us       0.000us         0.00%      14.495us       4.832us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.37%      21.690us        72.11%     463.759us     154.586us      10.847us       100.00%      14.495us       4.832us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.847us       100.00%      10.847us       3.616us             3  
+                                Activity Buffer Request        35.82%     230.365us        35.82%     230.365us     230.365us       3.648us        33.63%       3.648us       3.648us             1  
+                                       aten::empty_like         1.10%       7.080us         4.29%      27.590us       9.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.19%      20.510us         3.19%      20.510us       6.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.92%     211.704us        32.92%     211.704us      70.568us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.87%       5.571us         0.87%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 942.635us
-Self CUDA time total: 10.880us
+Self CPU time total: 643.134us
+Self CUDA time total: 10.847us
 
 
 
@@ -4168,19 +4168,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.254us      1095.61%     120.254us     120.254us             1  
-                               hf_kernels_causal_conv1d        12.25%      73.512us        99.17%     595.034us     595.034us       0.000us         0.00%      14.688us      14.688us             1  
-                                         CausalConv1dFn        11.59%      69.552us        86.91%     521.522us     173.841us       0.000us         0.00%      14.688us       4.896us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.21%      25.240us        70.04%     420.260us     140.087us      10.976us       100.00%      14.688us       4.896us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        38.36%     230.206us        38.36%     230.206us     230.206us       3.712us        33.82%       3.712us       3.712us             1  
-                                       aten::empty_like         1.43%       8.570us         5.28%      31.710us      10.570us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.86%      23.140us         3.86%      23.140us       7.713us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.47%     164.814us        27.47%     164.814us      54.938us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.83%       5.010us         0.83%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.798us      1140.44%     124.798us     124.798us             1  
+                               hf_kernels_causal_conv1d         8.17%      74.152us        99.44%     901.990us     901.990us       0.000us         0.00%      14.655us      14.655us             1  
+                                         CausalConv1dFn         7.50%      68.001us        91.26%     827.838us     275.946us       0.000us         0.00%      14.655us       4.885us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.46%      22.272us        79.87%     724.486us     241.495us      10.943us       100.00%      14.655us       4.885us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
+                                Activity Buffer Request        54.33%     492.860us        54.33%     492.860us     492.860us       3.712us        33.92%       3.712us       3.712us             1  
+                                       aten::empty_like         0.85%       7.700us         3.90%      35.351us      11.784us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.05%      27.651us         3.05%      27.651us       9.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        23.08%     209.354us        23.08%     209.354us      69.785us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.56%       5.090us         0.56%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 600.044us
-Self CUDA time total: 10.976us
+Self CPU time total: 907.080us
+Self CUDA time total: 10.943us
 
 
 
@@ -4190,19 +4190,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.621us      1053.10%     118.621us     118.621us             1  
-                               hf_kernels_causal_conv1d        18.68%      98.702us        99.03%     523.193us     523.193us       0.000us         0.00%      15.040us      15.040us             1  
-                                         CausalConv1dFn        13.73%      72.543us        80.34%     424.491us     141.497us       0.000us         0.00%      15.040us       5.013us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.48%      23.691us        60.95%     322.037us     107.346us      11.264us       100.00%      15.040us       5.013us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.264us       100.00%      11.264us       3.755us             3  
-                                Activity Buffer Request        26.02%     137.473us        26.02%     137.473us     137.473us       3.776us        33.52%       3.776us       3.776us             1  
-                                       aten::empty_like         1.47%       7.771us         5.66%      29.911us       9.970us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.19%      22.140us         4.19%      22.140us       7.380us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.45%     160.873us        30.45%     160.873us      53.624us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.97%       5.150us         0.97%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.030us       959.16%     112.030us     112.030us             1  
+                               hf_kernels_causal_conv1d        12.54%      75.713us        99.10%     598.293us     598.293us       0.000us         0.00%      15.584us      15.584us             1  
+                                         CausalConv1dFn        11.49%      69.370us        86.56%     522.580us     174.193us       0.000us         0.00%      15.584us       5.195us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.73%      22.541us        70.59%     426.160us     142.053us      11.680us       100.00%      15.584us       5.195us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.680us       100.00%      11.680us       3.893us             3  
+                                Activity Buffer Request        32.97%     199.044us        32.97%     199.044us     199.044us       3.904us        33.42%       3.904us       3.904us             1  
+                                       aten::empty_like         1.17%       7.060us         4.48%      27.050us       9.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.31%      19.990us         3.31%      19.990us       6.663us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.89%     204.575us        33.89%     204.575us      68.192us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.90%       5.430us         0.90%       5.430us       5.430us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 528.343us
-Self CUDA time total: 11.264us
+Self CPU time total: 603.723us
+Self CUDA time total: 11.680us
 
 
 
@@ -4212,19 +4212,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.863us       259.30%     128.863us     128.863us             1  
-                               hf_kernels_causal_conv1d         3.60%      75.972us        99.76%       2.104ms       2.104ms       0.000us         0.00%      82.688us      82.688us             1  
-                                         CausalConv1dFn         3.37%      70.961us        96.16%       2.028ms     675.870us       0.000us         0.00%      82.688us      27.563us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.16%      24.491us        91.36%       1.926ms     642.136us      49.696us       100.00%      82.688us      27.563us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      49.696us       100.00%      49.696us      16.565us             3  
-                                Activity Buffer Request        82.37%       1.737ms        82.37%       1.737ms       1.737ms      32.992us        66.39%      32.992us      32.992us             1  
-                                       aten::empty_like         0.41%       8.720us         1.43%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.02%      21.520us         1.02%      21.520us       7.173us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.83%     165.005us         7.83%     165.005us      55.002us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.070us         0.24%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.399us       245.20%     122.399us     122.399us             1  
+                               hf_kernels_causal_conv1d         8.97%      73.082us        99.30%     808.948us     808.948us       0.000us         0.00%      83.103us      83.103us             1  
+                                         CausalConv1dFn         8.32%      67.782us        90.33%     735.866us     245.289us       0.000us         0.00%      83.103us      27.701us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.86%      23.300us        78.56%     639.924us     213.308us      49.919us       100.00%      83.103us      27.701us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      49.919us       100.00%      49.919us      16.640us             3  
+                                Activity Buffer Request        50.48%     411.239us        50.48%     411.239us     411.239us      33.184us        66.48%      33.184us      33.184us             1  
+                                       aten::empty_like         0.94%       7.630us         3.46%      28.160us       9.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.52%      20.530us         2.52%      20.530us       6.843us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.21%     205.385us        25.21%     205.385us      68.462us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.70%       5.670us         0.70%       5.670us       5.670us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.109ms
-Self CUDA time total: 49.696us
+Self CPU time total: 814.618us
+Self CUDA time total: 49.919us
 
 
 
@@ -4234,19 +4234,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.493us       263.09%     122.493us     122.493us             1  
-                               hf_kernels_causal_conv1d        15.75%      73.853us        98.92%     463.792us     463.792us       0.000us         0.00%      76.190us      76.190us             1  
-                                         CausalConv1dFn        14.80%      69.370us        83.17%     389.939us     129.980us       0.000us         0.00%      76.190us      25.397us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.26%      24.662us        62.03%     290.847us      96.949us      46.559us       100.00%      76.190us      25.397us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.559us       100.00%      46.559us      15.520us             3  
-                                Activity Buffer Request        22.78%     106.792us        22.78%     106.792us     106.792us      29.631us        63.64%      29.631us      29.631us             1  
-                                       aten::empty_like         1.83%       8.590us         6.34%      29.722us       9.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.51%      21.132us         4.51%      21.132us       7.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.00%     159.393us        34.00%     159.393us      53.131us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.08%       5.060us         1.08%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.502us       271.23%     125.502us     125.502us             1  
+                               hf_kernels_causal_conv1d        16.33%      98.732us        99.17%     599.704us     599.704us       0.000us         0.00%      75.486us      75.486us             1  
+                                         CausalConv1dFn        10.93%      66.081us        82.84%     500.972us     166.991us       0.000us         0.00%      75.486us      25.162us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.99%      24.120us        67.22%     406.509us     135.503us      46.271us       100.00%      75.486us      25.162us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      46.271us       100.00%      46.271us      15.424us             3  
+                                Activity Buffer Request        29.69%     179.524us        29.69%     179.524us     179.524us      29.215us        63.14%      29.215us      29.215us             1  
+                                       aten::empty_like         1.23%       7.440us         4.69%      28.382us       9.461us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.46%      20.942us         3.46%      20.942us       6.981us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.55%     202.865us        33.55%     202.865us      67.622us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       5.010us         0.83%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 468.852us
-Self CUDA time total: 46.559us
+Self CPU time total: 604.714us
+Self CUDA time total: 46.271us
 
 
 
@@ -4256,19 +4256,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.279us      3132.21%     121.279us     121.279us             1  
-                               hf_kernels_causal_conv1d         4.47%      97.233us        99.75%       2.167ms       2.167ms       0.000us         0.00%       5.088us       5.088us             1  
-                                         CausalConv1dFn         3.35%      72.763us        95.28%       2.070ms     690.087us       0.000us         0.00%       5.088us       1.696us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.09%      23.651us        90.51%       1.967ms     655.553us       3.872us       100.00%       5.088us       1.696us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.872us       100.00%       3.872us       1.291us             3  
-                                Activity Buffer Request        81.94%       1.781ms        81.94%       1.781ms       1.781ms       1.216us        31.40%       1.216us       1.216us             1  
-                                       aten::empty_like         0.44%       9.520us         1.42%      30.840us      10.280us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.98%      21.320us         0.98%      21.320us       7.107us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.48%     162.473us         7.48%     162.473us      54.158us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       5.420us         0.25%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.631us      2933.93%     113.631us     113.631us             1  
+                               hf_kernels_causal_conv1d         9.16%      84.942us        99.40%     921.770us     921.770us       0.000us         0.00%       5.121us       5.121us             1  
+                                         CausalConv1dFn         7.39%      68.541us        90.24%     836.828us     278.943us       0.000us         0.00%       5.121us       1.707us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.33%      21.600us        79.89%     740.886us     246.962us       3.873us       100.00%       5.121us       1.707us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.873us       100.00%       3.873us       1.291us             3  
+                                Activity Buffer Request        58.56%     543.041us        58.56%     543.041us     543.041us       1.248us        32.22%       1.248us       1.248us             1  
+                                       aten::empty_like         0.81%       7.541us         2.95%      27.401us       9.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.14%      19.860us         2.14%      19.860us       6.620us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.01%     176.245us        19.01%     176.245us      58.748us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.60%       5.591us         0.60%       5.591us       5.591us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.173ms
-Self CUDA time total: 3.872us
+Self CPU time total: 927.361us
+Self CUDA time total: 3.873us
 
 
 
@@ -4278,19 +4278,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.259us      3001.54%     115.259us     115.259us             1  
-                               hf_kernels_causal_conv1d        21.11%     101.882us        98.95%     477.452us     477.452us       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn        14.32%      69.113us        77.83%     375.570us     125.190us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.08%      24.499us        57.45%     277.196us      92.399us       3.840us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
-                                Activity Buffer Request        19.54%      94.283us        19.54%      94.283us      94.283us       1.216us        31.67%       1.216us       1.216us             1  
-                                       aten::empty_like         1.58%       7.611us         6.06%      29.261us       9.754us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.49%      21.650us         4.49%      21.650us       7.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.83%     158.414us        32.83%     158.414us      52.805us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.05%       5.080us         1.05%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     110.365us      2851.07%     110.365us     110.365us             1  
+                               hf_kernels_causal_conv1d        16.99%      89.483us        99.03%     521.432us     521.432us       0.000us         0.00%       5.087us       5.087us             1  
+                                         CausalConv1dFn        12.41%      65.321us        82.04%     431.949us     143.983us       0.000us         0.00%       5.087us       1.696us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.32%      22.760us        64.08%     337.387us     112.462us       3.871us       100.00%       5.087us       1.696us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.871us       100.00%       3.871us       1.290us             3  
+                                Activity Buffer Request        29.67%     156.244us        29.67%     156.244us     156.244us       1.216us        31.41%       1.216us       1.216us             1  
+                                       aten::empty_like         1.31%       6.900us         5.55%      29.241us       9.747us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.24%      22.341us         4.24%      22.341us       7.447us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.08%     158.383us        30.08%     158.383us      52.794us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.97%       5.101us         0.97%       5.101us       5.101us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 482.532us
-Self CUDA time total: 3.840us
+Self CPU time total: 526.533us
+Self CUDA time total: 3.871us
 
 
 
@@ -4300,19 +4300,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.752us      2996.14%     122.752us     122.752us             1  
-                               hf_kernels_causal_conv1d         4.60%      95.013us        99.74%       2.062ms       2.062ms       0.000us         0.00%       5.473us       5.473us             1  
-                                         CausalConv1dFn         3.49%      72.262us        95.14%       1.967ms     655.743us       0.000us         0.00%       5.473us       1.824us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.30%      26.862us        90.17%       1.864ms     621.455us       4.097us       100.00%       5.473us       1.824us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.097us       100.00%       4.097us       1.366us             3  
-                                Activity Buffer Request        81.16%       1.678ms        81.16%       1.678ms       1.678ms       1.376us        33.59%       1.376us       1.376us             1  
-                                       aten::empty_like         0.40%       8.350us         1.48%      30.601us      10.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.08%      22.251us         1.08%      22.251us       7.417us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.71%     159.333us         7.71%     159.333us      53.111us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       5.450us         0.26%       5.450us       5.450us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.351us      2916.19%     120.351us     120.351us             1  
+                               hf_kernels_causal_conv1d         4.61%      96.291us        99.73%       2.085ms       2.085ms       0.000us         0.00%       5.502us       5.502us             1  
+                                         CausalConv1dFn         3.35%      69.993us        95.12%       1.989ms     662.875us       0.000us         0.00%       5.502us       1.834us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.651us        90.41%       1.890ms     630.060us       4.127us       100.00%       5.502us       1.834us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.127us       100.00%       4.127us       1.376us             3  
+                                Activity Buffer Request        81.44%       1.703ms        81.44%       1.703ms       1.703ms       1.375us        33.32%       1.375us       1.375us             1  
+                                       aten::empty_like         0.37%       7.800us         1.36%      28.450us       9.483us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.99%      20.650us         0.99%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.84%     163.843us         7.84%     163.843us      54.614us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.691us         0.27%       5.691us       5.691us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.068ms
-Self CUDA time total: 4.097us
+Self CPU time total: 2.091ms
+Self CUDA time total: 4.127us
 
 
 
@@ -4322,19 +4322,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.482us      3037.69%     123.482us     123.482us             1  
-                               hf_kernels_causal_conv1d        22.91%     113.331us        98.90%     489.311us     489.311us       0.000us         0.00%       5.441us       5.441us             1  
-                                         CausalConv1dFn        13.95%      69.033us        75.99%     375.980us     125.327us       0.000us         0.00%       5.441us       1.814us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.14%      25.431us        55.79%     276.017us      92.006us       4.065us       100.00%       5.441us       1.814us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
-                                Activity Buffer Request        18.49%      91.492us        18.49%      91.492us      91.492us       1.376us        33.85%       1.376us       1.376us             1  
-                                       aten::empty_like         1.55%       7.680us         6.25%      30.930us      10.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.70%      23.250us         4.70%      23.250us       7.750us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.16%     159.094us        32.16%     159.094us      53.031us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.440us         1.10%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     111.930us      2798.95%     111.930us     111.930us             1  
+                               hf_kernels_causal_conv1d        17.68%      85.272us        98.86%     476.700us     476.700us       0.000us         0.00%       5.343us       5.343us             1  
+                                         CausalConv1dFn        13.80%      66.543us        81.17%     391.428us     130.476us       0.000us         0.00%       5.343us       1.781us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.74%      22.878us        61.63%     297.205us      99.068us       3.999us       100.00%       5.343us       1.781us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.999us       100.00%       3.999us       1.333us             3  
+                                Activity Buffer Request        24.06%     116.003us        24.06%     116.003us     116.003us       1.344us        33.61%       1.344us       1.344us             1  
+                                       aten::empty_like         1.53%       7.380us         5.74%      27.680us       9.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.21%      20.300us         4.21%      20.300us       6.767us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.83%     158.324us        32.83%     158.324us      52.775us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.14%       5.510us         1.14%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 494.751us
-Self CUDA time total: 4.065us
+Self CPU time total: 482.210us
+Self CUDA time total: 3.999us
 
 
 
@@ -4344,19 +4344,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.726us      2264.25%     121.726us     121.726us             1  
-                               hf_kernels_causal_conv1d         4.87%     106.551us        99.77%       2.181ms       2.181ms       0.000us         0.00%       7.200us       7.200us             1  
-                                         CausalConv1dFn         3.29%      71.843us        94.90%       2.074ms     691.407us       0.000us         0.00%       7.200us       2.400us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.24%      27.032us        90.28%       1.973ms     657.779us       5.376us       100.00%       7.200us       2.400us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
-                                Activity Buffer Request        81.68%       1.785ms        81.68%       1.785ms       1.785ms       1.824us        33.93%       1.824us       1.824us             1  
-                                       aten::empty_like         0.35%       7.600us         1.33%      29.041us       9.680us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.98%      21.441us         0.98%      21.441us       7.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.36%     160.923us         7.36%     160.923us      53.641us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.940us         0.23%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.287us      2224.65%     120.287us     120.287us             1  
+                               hf_kernels_causal_conv1d         4.63%      99.053us        99.76%       2.136ms       2.136ms       0.000us         0.00%       7.231us       7.231us             1  
+                                         CausalConv1dFn         3.38%      72.351us        95.13%       2.037ms     679.021us       0.000us         0.00%       7.231us       2.410us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         0.97%      20.681us        90.42%       1.936ms     645.411us       5.407us       100.00%       7.231us       2.410us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.407us       100.00%       5.407us       1.802us             3  
+                                Activity Buffer Request        82.08%       1.758ms        82.08%       1.758ms       1.758ms       1.824us        33.73%       1.824us       1.824us             1  
+                                       aten::empty_like         0.37%       7.890us         1.33%      28.480us       9.493us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.96%      20.590us         0.96%      20.590us       6.863us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.38%     157.953us         7.38%     157.953us      52.651us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.24%       5.231us         0.24%       5.231us       5.231us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.186ms
-Self CUDA time total: 5.376us
+Self CPU time total: 2.141ms
+Self CUDA time total: 5.407us
 
 
 
@@ -4366,19 +4366,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.382us      2165.25%     116.382us     116.382us             1  
-                               hf_kernels_causal_conv1d        21.05%     101.062us        98.84%     474.631us     474.631us       0.000us         0.00%       7.198us       7.198us             1  
-                                         CausalConv1dFn        14.89%      71.522us        77.79%     373.569us     124.523us       0.000us         0.00%       7.198us       2.399us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.88%      23.451us        56.74%     272.486us      90.829us       5.375us       100.00%       7.198us       2.399us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.375us       100.00%       5.375us       1.792us             3  
-                                Activity Buffer Request        18.95%      91.012us        18.95%      91.012us      91.012us       1.823us        33.92%       1.823us       1.823us             1  
-                                       aten::empty_like         1.53%       7.370us         6.16%      29.561us       9.854us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.62%      22.191us         4.62%      22.191us       7.397us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.91%     158.023us        32.91%     158.023us      52.674us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.16%       5.571us         1.16%       5.571us       5.571us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     110.429us      2029.94%     110.429us     110.429us             1  
+                               hf_kernels_causal_conv1d        19.18%      90.602us        98.83%     466.890us     466.890us       0.000us         0.00%       7.296us       7.296us             1  
+                                         CausalConv1dFn        14.59%      68.912us        79.65%     376.288us     125.429us       0.000us         0.00%       7.296us       2.432us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.94%      23.331us        59.24%     279.846us      93.282us       5.440us       100.00%       7.296us       2.432us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.440us       100.00%       5.440us       1.813us             3  
+                                Activity Buffer Request        20.96%      99.012us        20.96%      99.012us      99.012us       1.856us        34.12%       1.856us       1.856us             1  
+                                       aten::empty_like         1.50%       7.079us         5.83%      27.530us       9.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.33%      20.451us         4.33%      20.451us       6.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.34%     157.503us        33.34%     157.503us      52.501us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.17%       5.520us         1.17%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 480.202us
-Self CUDA time total: 5.375us
+Self CPU time total: 472.410us
+Self CUDA time total: 5.440us
 
 
 
@@ -4388,19 +4388,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.143us       748.98%     130.143us     130.143us             1  
-                               hf_kernels_causal_conv1d         4.89%     101.833us        99.76%       2.079ms       2.079ms       0.000us         0.00%      23.200us      23.200us             1  
-                                         CausalConv1dFn         3.45%      71.962us        94.87%       1.978ms     659.169us       0.000us         0.00%      23.200us       7.733us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.13%      23.451us        89.92%       1.874ms     624.738us      17.376us       100.00%      23.200us       7.733us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.376us       100.00%      17.376us       5.792us             3  
-                                Activity Buffer Request        80.93%       1.687ms        80.93%       1.687ms       1.687ms       5.824us        33.52%       5.824us       5.824us             1  
-                                       aten::empty_like         0.43%       8.941us         1.50%      31.331us      10.444us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.07%      22.390us         1.07%      22.390us       7.463us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.86%     163.753us         7.86%     163.753us      54.584us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.080us         0.24%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.191us       690.48%     120.191us     120.191us             1  
+                               hf_kernels_causal_conv1d         4.71%     101.512us        99.75%       2.151ms       2.151ms       0.000us         0.00%      23.231us      23.231us             1  
+                                         CausalConv1dFn         3.18%      68.482us        95.04%       2.050ms     683.192us       0.000us         0.00%      23.231us       7.744us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.07%      23.020us        90.55%       1.953ms     650.931us      17.407us       100.00%      23.231us       7.744us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.407us       100.00%      17.407us       5.802us             3  
+                                Activity Buffer Request        82.08%       1.770ms        82.08%       1.770ms       1.770ms       5.824us        33.46%       5.824us       5.824us             1  
+                                       aten::empty_like         0.34%       7.400us         1.31%      28.300us       9.433us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.97%      20.900us         0.97%      20.900us       6.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.40%     159.664us         7.40%     159.664us      53.221us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       5.480us         0.25%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.084ms
-Self CUDA time total: 17.376us
+Self CPU time total: 2.157ms
+Self CUDA time total: 17.407us
 
 
 
@@ -4410,19 +4410,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.030us       694.61%     124.030us     124.030us             1  
-                               hf_kernels_causal_conv1d        16.72%      76.202us        98.90%     450.691us     450.691us       0.000us         0.00%      23.872us      23.872us             1  
-                                         CausalConv1dFn        15.17%      69.121us        82.18%     374.489us     124.830us       0.000us         0.00%      23.872us       7.957us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.17%      23.551us        60.29%     274.737us      91.579us      17.856us       100.00%      23.872us       7.957us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.856us       100.00%      17.856us       5.952us             3  
-                                Activity Buffer Request        20.41%      93.033us        20.41%      93.033us      93.033us       6.016us        33.69%       6.016us       6.016us             1  
-                                       aten::empty_like         1.72%       7.841us         6.72%      30.631us      10.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         5.00%      22.790us         5.00%      22.790us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.70%     158.153us        34.70%     158.153us      52.718us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.020us         1.10%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.167us       646.17%     115.167us     115.167us             1  
+                               hf_kernels_causal_conv1d        20.44%      95.313us        98.80%     460.750us     460.750us       0.000us         0.00%      23.806us      23.806us             1  
+                                         CausalConv1dFn        14.34%      66.892us        78.36%     365.437us     121.812us       0.000us         0.00%      23.806us       7.935us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.79%      22.348us        57.90%     270.035us      90.012us      17.823us       100.00%      23.806us       7.935us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.823us       100.00%      17.823us       5.941us             3  
+                                Activity Buffer Request        19.71%      91.942us        19.71%      91.942us      91.942us       5.983us        33.57%       5.983us       5.983us             1  
+                                       aten::empty_like         1.51%       7.049us         6.11%      28.510us       9.503us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.60%      21.461us         4.60%      21.461us       7.154us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.40%     155.745us        33.40%     155.745us      51.915us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.20%       5.611us         1.20%       5.611us       5.611us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 455.711us
-Self CUDA time total: 17.856us
+Self CPU time total: 466.361us
+Self CUDA time total: 17.823us
 
 
 
@@ -4432,19 +4432,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.247us       722.62%     129.247us     129.247us             1  
-                               hf_kernels_causal_conv1d         3.52%      75.733us        99.76%       2.146ms       2.146ms       0.000us         0.00%      23.870us      23.870us             1  
-                                         CausalConv1dFn         3.45%      74.171us        96.24%       2.070ms     690.050us       0.000us         0.00%      23.870us       7.957us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.15%      24.680us        91.39%       1.966ms     655.299us      17.886us       100.00%      23.870us       7.957us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.886us       100.00%      17.886us       5.962us             3  
-                                Activity Buffer Request        82.74%       1.780ms        82.74%       1.780ms       1.780ms       5.984us        33.46%       5.984us       5.984us             1  
-                                       aten::empty_like         0.37%       7.882us         1.40%      30.082us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.03%      22.200us         1.03%      22.200us       7.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.51%     161.493us         7.51%     161.493us      53.831us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.24%       5.140us         0.24%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.864us       721.68%     128.864us     128.864us             1  
+                               hf_kernels_causal_conv1d         4.82%     101.713us        99.74%       2.105ms       2.105ms       0.000us         0.00%      23.872us      23.872us             1  
+                                         CausalConv1dFn         3.47%      73.151us        94.92%       2.003ms     667.604us       0.000us         0.00%      23.872us       7.957us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.00%      21.171us        90.08%       1.901ms     633.524us      17.856us       100.00%      23.872us       7.957us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.856us       100.00%      17.856us       5.952us             3  
+                                Activity Buffer Request        81.34%       1.716ms        81.34%       1.716ms       1.716ms       6.016us        33.69%       6.016us       6.016us             1  
+                                       aten::empty_like         0.39%       8.269us         1.38%      29.091us       9.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.99%      20.822us         0.99%      20.822us       6.941us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.73%     163.083us         7.73%     163.083us      54.361us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.420us         0.26%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.151ms
-Self CUDA time total: 17.886us
+Self CPU time total: 2.110ms
+Self CUDA time total: 17.856us
 
 
 
@@ -4454,19 +4454,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.387us       688.73%     127.387us     127.387us             1  
-                               hf_kernels_causal_conv1d        17.30%      79.433us        98.83%     453.742us     453.742us       0.000us         0.00%      24.704us      24.704us             1  
-                                         CausalConv1dFn        15.18%      69.712us        81.53%     374.309us     124.770us       0.000us         0.00%      24.704us       8.235us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.33%      24.481us        59.78%     274.427us      91.476us      18.496us       100.00%      24.704us       8.235us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.496us       100.00%      18.496us       6.165us             3  
-                                Activity Buffer Request        19.95%      91.582us        19.95%      91.582us      91.582us       6.208us        33.56%       6.208us       6.208us             1  
-                                       aten::empty_like         1.61%       7.379us         6.57%      30.170us      10.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.96%      22.791us         4.96%      22.791us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.50%     158.364us        34.50%     158.364us      52.788us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.17%       5.350us         1.17%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.180us       600.52%     113.180us     113.180us             1  
+                               hf_kernels_causal_conv1d        20.91%      97.193us        98.79%     459.100us     459.100us       0.000us         0.00%      25.055us      25.055us             1  
+                                         CausalConv1dFn        14.11%      65.571us        77.87%     361.907us     120.636us       0.000us         0.00%      25.055us       8.352us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.76%      22.110us        57.70%     268.175us      89.392us      18.847us       100.00%      25.055us       8.352us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.847us       100.00%      18.847us       6.282us             3  
+                                Activity Buffer Request        19.33%      89.852us        19.33%      89.852us      89.852us       6.208us        32.94%       6.208us       6.208us             1  
+                                       aten::empty_like         1.50%       6.961us         6.06%      28.161us       9.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.56%      21.200us         4.56%      21.200us       7.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.61%     156.213us        33.61%     156.213us      52.071us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.21%       5.640us         1.21%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 459.092us
-Self CUDA time total: 18.496us
+Self CPU time total: 464.740us
+Self CUDA time total: 18.847us
 
 
 
@@ -4476,19 +4476,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         4.39%      92.153us        99.77%       2.095ms       2.095ms       0.000us         0.00%     161.921us     161.921us             1  
-                                         CausalConv1dFn         3.68%      77.182us        95.38%       2.002ms     667.453us       0.000us         0.00%     161.921us      53.974us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.14%      23.952us        90.24%       1.894ms     631.499us      97.345us       100.00%     161.921us      53.974us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     141.279us       145.13%     141.279us     141.279us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.345us       100.00%      97.345us      32.448us             3  
-                                Activity Buffer Request        81.45%       1.710ms        81.45%       1.710ms       1.710ms      64.576us        66.34%      64.576us      64.576us             1  
-                                       aten::empty_like         0.41%       8.600us         1.46%      30.680us      10.227us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.05%      22.080us         1.05%      22.080us       7.360us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         7.65%     160.703us         7.65%     160.703us      53.568us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       4.850us         0.23%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         4.80%     101.712us        99.74%       2.115ms       2.115ms       0.000us         0.00%     161.662us     161.662us             1  
+                                         CausalConv1dFn         3.27%      69.301us        94.94%       2.014ms     671.195us       0.000us         0.00%     161.662us      53.887us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.10%      23.392us        90.33%       1.916ms     638.601us      97.023us       100.00%     161.662us      53.887us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.911us       134.93%     130.911us     130.911us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.023us       100.00%      97.023us      32.341us             3  
+                                Activity Buffer Request        81.56%       1.730ms        81.56%       1.730ms       1.730ms      64.639us        66.62%      64.639us      64.639us             1  
+                                       aten::empty_like         0.39%       8.180us         1.34%      28.480us       9.493us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.96%      20.300us         0.96%      20.300us       6.767us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         7.66%     162.533us         7.66%     162.533us      54.178us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       5.581us         0.26%       5.581us       5.581us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.099ms
-Self CUDA time total: 97.345us
+Self CPU time total: 2.121ms
+Self CUDA time total: 97.023us
 
 
 
@@ -4498,19 +4498,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        16.74%      76.723us        98.88%     453.112us     453.112us       0.000us         0.00%     162.555us     162.555us             1  
-                                         CausalConv1dFn        14.95%      68.512us        82.13%     376.389us     125.463us       0.000us         0.00%     162.555us      54.185us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.28%      24.211us        60.48%     277.136us      92.379us      98.269us       100.00%     162.555us      54.185us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.235us       132.53%     130.235us     130.235us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.269us       100.00%      98.269us      32.756us             3  
-                                Activity Buffer Request        20.17%      92.422us        20.17%      92.422us      92.422us      64.286us        65.42%      64.286us      64.286us             1  
-                                       aten::empty_like         1.77%       8.119us         6.71%      30.741us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.94%      22.622us         4.94%      22.622us       7.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.02%     160.503us        35.02%     160.503us      53.501us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.12%       5.150us         1.12%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        19.73%      90.504us        98.74%     452.900us     452.900us       0.000us         0.00%     165.018us     165.018us             1  
+                                         CausalConv1dFn        14.46%      66.339us        79.01%     362.396us     120.799us       0.000us         0.00%     165.018us      55.006us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.97%      22.791us        58.44%     268.046us      89.349us      99.612us       100.00%     165.018us      55.006us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.188us       124.67%     124.188us     124.188us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      99.612us       100.00%      99.612us      33.204us             3  
+                                Activity Buffer Request        19.01%      87.182us        19.01%      87.182us      87.182us      65.406us        65.66%      65.406us      65.406us             1  
+                                       aten::empty_like         1.58%       7.240us         6.11%      28.011us       9.337us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.53%      20.771us         4.53%      20.771us       6.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.46%     158.073us        34.46%     158.073us      52.691us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.26%       5.760us         1.26%       5.760us       5.760us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 458.262us
-Self CUDA time total: 98.269us
+Self CPU time total: 458.660us
+Self CUDA time total: 99.612us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4542,14 +4542,15 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
-
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 11 files: 45%|████▌ | 5/11 [00:00<00:00, 48.10it/s] -Fetching 11 files: 91%|█████████ | 10/11 [00:01<00:00, 4.70it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.98it/s]
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] +Fetching 11 files: 9%|▉ | 1/11 [00:00<00:02, 4.50it/s] +Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.34it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.31it/s]

Artifacts:

causal_conv1d.jsonl diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html index 1f26e9bbb5dcd85ce5688a715cc18ba10c292c3f..8ce2a3cdc26dda61748435c504b3f9ede5025dbe 100644 --- a/causal_conv1d/impls/torch_causal_conv1d.html +++ b/causal_conv1d/impls/torch_causal_conv1d.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.25s +Cell: nv | 0.24s | Raw @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:54:47 2025       
+
Fri Dec 19 22:48:25 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   37C    P0             82W /  350W |       0MiB /  46068MiB |     14%      Default |
+| N/A   28C    P0             90W /  350W |       0MiB /  46068MiB |     10%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.79s
+Cell: benchmark | 33.94s
  | 
 
 Raw
@@ -3999,29 +3999,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     436.163us      2260.26%     436.163us     436.163us             1  
-                                            torch_eager         9.04%     219.426us        99.40%       2.414ms       2.414ms       0.000us         0.00%      21.633us      21.633us             1  
-                                               aten::to         0.42%      10.088us        82.02%       1.992ms     331.955us       0.000us         0.00%      14.273us       2.379us             6  
-                                         aten::_to_copy         1.72%      41.805us        81.60%       1.982ms     330.274us       0.000us         0.00%      14.273us       2.379us             6  
-                                            aten::copy_         2.55%      61.862us        77.44%       1.881ms     313.438us      11.937us        61.86%      14.273us       2.379us             6  
-                                           aten::conv1d         0.37%       9.100us         6.66%     161.626us      53.875us       0.000us         0.00%       7.360us       2.453us             3  
-                                      aten::convolution         0.65%      15.771us         6.28%     152.526us      50.842us       0.000us         0.00%       7.360us       2.453us             3  
-                                     aten::_convolution         1.37%      33.342us         5.63%     136.755us      45.585us       0.000us         0.00%       7.360us       2.453us             3  
-                                aten::_conv_depthwise2d         1.40%      34.091us         3.35%      81.282us      27.094us       7.360us        38.14%       7.360us       2.453us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        38.14%       7.360us       2.453us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.67%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.633us        29.19%       5.633us       1.878us             3  
-                                Activity Buffer Request        72.13%       1.752ms        72.13%       1.752ms       1.752ms       2.336us        12.11%       2.336us       2.336us             1  
-                                    aten::empty_strided         2.44%      59.211us         2.44%      59.211us       9.868us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.74%      90.852us         3.74%      90.852us      10.095us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.11%      27.021us         1.43%      34.781us       3.865us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.54%      13.120us         0.54%      13.120us       0.875us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%      12.090us         0.50%      12.090us       4.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%      11.450us         0.47%      11.450us       3.817us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.39%       9.560us         0.46%      11.070us       3.690us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     443.583us      2294.79%     443.583us     443.583us             1  
+                                            torch_eager         5.05%     224.934us        75.09%       3.343ms       3.343ms       0.000us         0.00%      21.698us      21.698us             1  
+                                               aten::to         0.24%      10.861us        65.47%       2.914ms     485.726us       0.000us         0.00%      14.370us       2.395us             6  
+                                         aten::_to_copy         1.00%      44.312us        65.22%       2.903ms     483.916us       0.000us         0.00%      14.370us       2.395us             6  
+                                            aten::copy_         1.44%      63.961us        42.41%       1.888ms     314.646us      12.002us        62.09%      14.370us       2.395us             6  
+                                           aten::conv1d         0.19%       8.510us         3.64%     161.973us      53.991us       0.000us         0.00%       7.328us       2.443us             3  
+                                      aten::convolution         0.39%      17.300us         3.45%     153.463us      51.154us       0.000us         0.00%       7.328us       2.443us             3  
+                                     aten::_convolution         0.75%      33.601us         3.06%     136.163us      45.388us       0.000us         0.00%       7.328us       2.443us             3  
+                                aten::_conv_depthwise2d         0.77%      34.142us         1.86%      82.892us      27.631us       7.328us        37.91%       7.328us       2.443us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.91%       7.328us       2.443us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.337us        32.78%       6.337us       2.112us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.665us        29.31%       5.665us       1.888us             3  
+                                Activity Buffer Request        39.41%       1.754ms        39.41%       1.754ms       1.754ms       2.368us        12.25%       2.368us       2.368us             1  
+                                    aten::empty_strided        21.82%     971.311us        21.82%     971.311us     161.885us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.12%      94.383us         2.12%      94.383us      10.487us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.60%      26.802us         0.78%      34.811us       3.868us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.29%      12.970us         0.29%      12.970us       0.865us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.27%      12.049us         0.27%      12.049us       4.016us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.27%      12.050us         0.27%      12.050us       4.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.16%       7.339us         0.20%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.428ms
-Self CUDA time total: 19.297us
+Self CPU time total: 4.452ms
+Self CUDA time total: 19.330us
 
 
 
@@ -4031,29 +4031,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.230us      1639.85%     323.230us     323.230us             1  
-                                            torch_eager         5.66%     125.716us        99.73%       2.215ms       2.215ms       0.000us         0.00%      21.919us      21.919us             1  
-                                               aten::to         0.28%       6.120us        87.62%       1.946ms     324.308us       0.000us         0.00%      13.951us       2.325us             6  
-                                         aten::_to_copy         1.09%      24.242us        87.34%       1.940ms     323.288us       0.000us         0.00%      13.951us       2.325us             6  
-                                            aten::copy_         2.08%      46.140us        84.89%       1.885ms     314.223us      11.743us        59.58%      13.951us       2.325us             6  
-                                           aten::conv1d         0.26%       5.800us         5.28%     117.333us      39.111us       0.000us         0.00%       7.968us       2.656us             3  
-                                      aten::convolution         0.51%      11.340us         5.02%     111.533us      37.178us       0.000us         0.00%       7.968us       2.656us             3  
-                                     aten::_convolution         1.02%      22.760us         4.51%     100.193us      33.398us       0.000us         0.00%       7.968us       2.656us             3  
-                                aten::_conv_depthwise2d         0.99%      21.901us         2.76%      61.233us      20.411us       7.968us        40.42%       7.968us       2.656us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        40.42%       7.968us       2.656us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.33%       6.176us       2.059us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.567us        28.24%       5.567us       1.856us             3  
-                                Activity Buffer Request        80.76%       1.794ms        80.76%       1.794ms       1.794ms       2.208us        11.20%       2.208us       2.208us             1  
-                                    aten::empty_strided         1.36%      30.150us         1.36%      30.150us       5.025us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.98%      66.273us         2.98%      66.273us       7.364us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.76%      16.860us         0.98%      21.769us       2.419us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.39%       8.679us         0.39%       8.679us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.42%       9.331us         0.42%       9.331us       3.110us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.42%       9.410us         0.42%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       6.040us         0.33%       7.320us       2.440us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.318us      1699.66%     332.318us     332.318us             1  
+                                            torch_eager         5.93%     124.693us        99.74%       2.097ms       2.097ms       0.000us         0.00%      21.696us      21.696us             1  
+                                               aten::to         0.26%       5.560us        86.81%       1.825ms     304.128us       0.000us         0.00%      13.792us       2.299us             6  
+                                         aten::_to_copy         1.04%      21.892us        86.55%       1.819ms     303.202us       0.000us         0.00%      13.792us       2.299us             6  
+                                            aten::copy_         2.25%      47.330us        83.99%       1.765ms     294.235us      11.648us        59.57%      13.792us       2.299us             6  
+                                           aten::conv1d         0.33%       6.849us         5.77%     121.312us      40.437us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         0.42%       8.801us         5.45%     114.463us      38.154us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         1.06%      22.231us         5.03%     105.662us      35.221us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         1.31%      27.600us         3.25%      68.311us      22.770us       7.904us        40.43%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.43%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        31.10%       6.080us       2.027us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.48%       5.568us       1.856us             3  
+                                Activity Buffer Request        79.40%       1.669ms        79.40%       1.669ms       1.669ms       2.144us        10.97%       2.144us       2.144us             1  
+                                    aten::empty_strided         1.52%      31.910us         1.52%      31.910us       5.318us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.37%      70.742us         3.37%      70.742us       7.860us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.77%      16.170us         1.03%      21.551us       2.395us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       9.002us         0.43%       9.002us       0.600us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.820us         0.47%       9.820us       3.273us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.180us         0.44%       9.180us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.25%       5.320us         0.31%       6.610us       2.203us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.221ms
-Self CUDA time total: 19.711us
+Self CPU time total: 2.102ms
+Self CUDA time total: 19.552us
 
 
 
@@ -4063,29 +4063,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     318.337us      1694.72%     318.337us     318.337us             1  
-                                            torch_eager         5.96%     125.666us        99.74%       2.103ms       2.103ms       0.000us         0.00%      20.799us      20.799us             1  
-                                               aten::to         0.29%       6.100us        87.07%       1.836ms     305.953us       0.000us         0.00%      13.822us       2.304us             6  
-                                         aten::_to_copy         1.12%      23.630us        86.78%       1.830ms     304.936us       0.000us         0.00%      13.822us       2.304us             6  
-                                            aten::copy_         2.25%      47.463us        84.23%       1.776ms     295.964us      11.807us        62.86%      13.822us       2.304us             6  
-                                           aten::conv1d         0.29%       6.140us         5.41%     114.033us      38.011us       0.000us         0.00%       6.977us       2.326us             3  
-                                      aten::convolution         0.47%       9.960us         5.12%     107.893us      35.964us       0.000us         0.00%       6.977us       2.326us             3  
-                                     aten::_convolution         1.11%      23.480us         4.65%      97.933us      32.644us       0.000us         0.00%       6.977us       2.326us             3  
-                                aten::_conv_depthwise2d         1.00%      21.010us         2.82%      59.553us      19.851us       6.977us        37.14%       6.977us       2.326us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.977us        37.14%       6.977us       2.326us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        32.19%       6.047us       2.016us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.760us        30.66%       5.760us       1.920us             3  
-                                Activity Buffer Request        79.84%       1.683ms        79.84%       1.683ms       1.683ms       2.015us        10.73%       2.015us       2.015us             1  
-                                    aten::empty_strided         1.43%      30.201us         1.43%      30.201us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.12%      65.751us         3.12%      65.751us       7.306us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.82%      17.281us         1.06%      22.310us       2.479us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.41%       8.589us         0.41%       8.589us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.43%       9.162us         0.43%       9.162us       3.054us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.41%       8.730us         0.41%       8.730us       2.910us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.25%       5.310us         0.31%       6.480us       2.160us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     321.886us      1719.48%     321.886us     321.886us             1  
+                                            torch_eager         5.64%     122.557us        99.74%       2.168ms       2.168ms       0.000us         0.00%      20.704us      20.704us             1  
+                                               aten::to         0.28%       6.020us        87.70%       1.906ms     317.672us       0.000us         0.00%      13.728us       2.288us             6  
+                                         aten::_to_copy         1.02%      22.261us        87.43%       1.900ms     316.669us       0.000us         0.00%      13.728us       2.288us             6  
+                                            aten::copy_         2.09%      45.470us        85.00%       1.847ms     307.868us      11.744us        62.74%      13.728us       2.288us             6  
+                                           aten::conv1d         0.30%       6.500us         5.24%     113.822us      37.941us       0.000us         0.00%       6.976us       2.325us             3  
+                                      aten::convolution         0.46%      10.091us         4.94%     107.322us      35.774us       0.000us         0.00%       6.976us       2.325us             3  
+                                     aten::_convolution         0.98%      21.191us         4.47%      97.231us      32.410us       0.000us         0.00%       6.976us       2.325us             3  
+                                aten::_conv_depthwise2d         1.00%      21.810us         2.83%      61.480us      20.493us       6.976us        37.26%       6.976us       2.325us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.976us        37.26%       6.976us       2.325us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.016us        32.14%       6.016us       2.005us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        30.60%       5.728us       1.909us             3  
+                                Activity Buffer Request        80.69%       1.754ms        80.69%       1.754ms       1.754ms       1.984us        10.60%       1.984us       1.984us             1  
+                                    aten::empty_strided         1.41%      30.541us         1.41%      30.541us       5.090us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.18%      69.051us         3.18%      69.051us       7.672us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.74%      16.087us         0.96%      20.947us       2.327us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       8.541us         0.39%       8.541us       0.569us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%       9.970us         0.46%       9.970us       3.323us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       8.750us         0.40%       8.750us       2.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.051us         0.29%       6.381us       2.127us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.108ms
-Self CUDA time total: 18.784us
+Self CPU time total: 2.173ms
+Self CUDA time total: 18.720us
 
 
 
@@ -4095,29 +4095,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     331.012us      1687.37%     331.012us     331.012us             1  
-                                            torch_eager         5.07%     123.976us        99.78%       2.439ms       2.439ms       0.000us         0.00%      21.729us      21.729us             1  
-                                               aten::to         0.26%       6.330us        88.80%       2.171ms     361.846us       0.000us         0.00%      14.016us       2.336us             6  
-                                         aten::_to_copy         0.96%      23.551us        88.54%       2.165ms     360.791us       0.000us         0.00%      14.016us       2.336us             6  
-                                            aten::copy_         1.90%      46.440us        86.33%       2.111ms     351.776us      11.904us        60.68%      14.016us       2.336us             6  
-                                           aten::conv1d         0.24%       5.859us         4.84%     118.253us      39.418us       0.000us         0.00%       7.713us       2.571us             3  
-                                      aten::convolution         0.37%       9.110us         4.60%     112.394us      37.465us       0.000us         0.00%       7.713us       2.571us             3  
-                                     aten::_convolution         0.99%      24.110us         4.22%     103.284us      34.428us       0.000us         0.00%       7.713us       2.571us             3  
-                                aten::_conv_depthwise2d         0.86%      20.950us         2.48%      60.661us      20.220us       7.713us        39.32%       7.713us       2.571us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.713us        39.32%       7.713us       2.571us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.48%       6.176us       2.059us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.20%       5.728us       1.909us             3  
-                                Activity Buffer Request        74.74%       1.827ms        74.74%       1.827ms       1.827ms       2.112us        10.77%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.25%      30.541us         1.25%      30.541us       5.090us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.54%     257.579us        10.54%     257.579us      28.620us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.76%      18.570us         0.98%      23.841us       2.649us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.35%       8.661us         0.35%       8.661us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.39%       9.560us         0.39%       9.560us       3.187us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       9.470us         0.39%       9.470us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       7.292us         0.35%       8.562us       2.854us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.964us      1656.85%     323.964us     323.964us             1  
+                                            torch_eager         5.53%     125.552us        99.77%       2.267ms       2.267ms       0.000us         0.00%      21.665us      21.665us             1  
+                                               aten::to         0.26%       5.900us        88.05%       2.001ms     333.419us       0.000us         0.00%      13.985us       2.331us             6  
+                                         aten::_to_copy         0.97%      21.980us        87.79%       1.995ms     332.436us       0.000us         0.00%      13.985us       2.331us             6  
+                                            aten::copy_         2.06%      46.813us        85.51%       1.943ms     323.777us      11.873us        60.72%      13.985us       2.331us             6  
+                                           aten::conv1d         0.29%       6.601us         5.07%     115.272us      38.424us       0.000us         0.00%       7.680us       2.560us             3  
+                                      aten::convolution         0.40%       9.150us         4.78%     108.671us      36.224us       0.000us         0.00%       7.680us       2.560us             3  
+                                     aten::_convolution         0.96%      21.770us         4.38%      99.521us      33.174us       0.000us         0.00%       7.680us       2.560us             3  
+                                aten::_conv_depthwise2d         0.97%      21.939us         2.77%      62.961us      20.987us       7.680us        39.28%       7.680us       2.560us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.680us        39.28%       7.680us       2.560us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.177us        31.59%       6.177us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        29.13%       5.696us       1.899us             3  
+                                Activity Buffer Request        73.99%       1.681ms        73.99%       1.681ms       1.681ms       2.112us        10.80%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.32%      29.970us         1.32%      29.970us       4.995us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.42%     236.845us        10.42%     236.845us      26.316us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.72%      16.321us         0.94%      21.351us       2.372us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.680us         0.38%       8.680us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%       9.870us         0.43%       9.870us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       9.231us         0.41%       9.231us       3.077us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.22%       5.110us         0.28%       6.420us       2.140us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.445ms
-Self CUDA time total: 19.617us
+Self CPU time total: 2.272ms
+Self CUDA time total: 19.553us
 
 
 
@@ -4127,29 +4127,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.714us      1318.86%     323.714us     323.714us             1  
-                                            torch_eager         5.46%     123.145us        99.75%       2.249ms       2.249ms       0.000us         0.00%      26.849us      26.849us             1  
-                                               aten::to         0.25%       5.731us        88.05%       1.985ms     330.837us       0.000us         0.00%      15.297us       2.550us             6  
-                                         aten::_to_copy         1.06%      23.869us        87.80%       1.979ms     329.882us       0.000us         0.00%      15.297us       2.550us             6  
-                                            aten::copy_         2.13%      47.950us        85.40%       1.925ms     320.895us      12.993us        52.94%      15.297us       2.550us             6  
-                                           aten::conv1d         0.25%       5.619us         5.08%     114.483us      38.161us       0.000us         0.00%      11.552us       3.851us             3  
-                                      aten::convolution         0.41%       9.331us         4.83%     108.864us      36.288us       0.000us         0.00%      11.552us       3.851us             3  
-                                     aten::_convolution         0.98%      22.020us         4.42%      99.533us      33.178us       0.000us         0.00%      11.552us       3.851us             3  
-                                aten::_conv_depthwise2d         0.92%      20.831us         2.69%      60.562us      20.187us      11.552us        47.06%      11.552us       3.851us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.06%      11.552us       3.851us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.688us        27.25%       6.688us       2.229us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.305us        25.69%       6.305us       2.102us             3  
-                                Activity Buffer Request        74.33%       1.676ms        74.33%       1.676ms       1.676ms       2.304us         9.39%       2.304us       2.304us             1  
-                                    aten::empty_strided         1.33%      30.051us         1.33%      30.051us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.91%     223.307us         9.91%     223.307us      24.812us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.79%      17.850us         1.04%      23.451us       2.606us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.40%       8.951us         0.40%       8.951us       0.597us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.121us         0.40%       9.121us       3.040us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       8.980us         0.40%       8.980us       2.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       6.160us         0.32%       7.280us       2.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.612us      1313.41%     323.612us     323.612us             1  
+                                            torch_eager         5.38%     121.196us        99.77%       2.248ms       2.248ms       0.000us         0.00%      26.943us      26.943us             1  
+                                               aten::to         0.27%       6.030us        88.19%       1.987ms     331.149us       0.000us         0.00%      15.327us       2.555us             6  
+                                         aten::_to_copy         1.03%      23.263us        87.92%       1.981ms     330.144us       0.000us         0.00%      15.327us       2.555us             6  
+                                            aten::copy_         2.13%      48.030us        85.56%       1.928ms     321.255us      13.023us        52.86%      15.327us       2.555us             6  
+                                           aten::conv1d         0.25%       5.560us         5.05%     113.792us      37.931us       0.000us         0.00%      11.616us       3.872us             3  
+                                      aten::convolution         0.43%       9.790us         4.80%     108.232us      36.077us       0.000us         0.00%      11.616us       3.872us             3  
+                                     aten::_convolution         0.97%      21.854us         4.37%      98.442us      32.814us       0.000us         0.00%      11.616us       3.872us             3  
+                                aten::_conv_depthwise2d         0.93%      20.910us         2.73%      61.610us      20.537us      11.616us        47.14%      11.616us       3.872us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.616us        47.14%      11.616us       3.872us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.687us        27.14%       6.687us       2.229us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.72%       6.336us       2.112us             3  
+                                Activity Buffer Request        75.30%       1.697ms        75.30%       1.697ms       1.697ms       2.304us         9.35%       2.304us       2.304us             1  
+                                    aten::empty_strided         1.33%      30.069us         1.33%      30.069us       5.011us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.09%     204.783us         9.09%     204.783us      22.754us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.75%      16.840us         0.97%      21.869us       2.430us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       8.780us         0.39%       8.780us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%       9.680us         0.43%       9.680us       3.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       9.200us         0.41%       9.200us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.139us         0.29%       6.519us       2.173us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.254ms
-Self CUDA time total: 24.545us
+Self CPU time total: 2.253ms
+Self CUDA time total: 24.639us
 
 
 
@@ -4159,29 +4159,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.293us      1275.69%     332.293us     332.293us             1  
-                                            torch_eager         4.94%     125.275us        99.79%       2.531ms       2.531ms       0.000us         0.00%      28.288us      28.288us             1  
-                                               aten::to         0.24%       6.110us        89.17%       2.261ms     376.887us       0.000us         0.00%      15.199us       2.533us             6  
-                                         aten::_to_copy         0.97%      24.638us        88.93%       2.255ms     375.868us       0.000us         0.00%      15.199us       2.533us             6  
-                                            aten::copy_         1.89%      47.902us        86.77%       2.200ms     366.727us      12.959us        49.75%      15.199us       2.533us             6  
-                                           aten::conv1d         0.23%       5.820us         4.59%     116.433us      38.811us       0.000us         0.00%      13.089us       4.363us             3  
-                                      aten::convolution         0.40%      10.180us         4.36%     110.613us      36.871us       0.000us         0.00%      13.089us       4.363us             3  
-                                     aten::_convolution         0.89%      22.520us         3.96%     100.433us      33.478us       0.000us         0.00%      13.089us       4.363us             3  
-                                aten::_conv_depthwise2d         0.83%      21.002us         2.40%      60.753us      20.251us      13.089us        50.25%      13.089us       4.363us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.089us        50.25%      13.089us       4.363us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.591us        25.30%       6.591us       2.197us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.45%       6.368us       2.123us             3  
-                                Activity Buffer Request        77.38%       1.962ms        77.38%       1.962ms       1.962ms       2.240us         8.60%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.19%      30.211us         1.19%      30.211us       5.035us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.30%     210.437us         8.30%     210.437us      23.382us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.72%      18.342us         0.94%      23.791us       2.643us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       9.059us         0.36%       9.059us       0.604us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.39%       9.940us         0.39%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       9.490us         0.37%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       7.000us         0.33%       8.470us       2.823us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.741us      1242.82%     323.741us     323.741us             1  
+                                            torch_eager         5.26%     120.985us        99.76%       2.294ms       2.294ms       0.000us         0.00%      28.321us      28.321us             1  
+                                               aten::to         0.24%       5.569us        88.38%       2.032ms     338.735us       0.000us         0.00%      15.233us       2.539us             6  
+                                         aten::_to_copy         0.96%      21.983us        88.14%       2.027ms     337.807us       0.000us         0.00%      15.233us       2.539us             6  
+                                            aten::copy_         2.04%      46.990us        85.81%       1.973ms     328.880us      12.961us        49.76%      15.233us       2.539us             6  
+                                           aten::conv1d         0.28%       6.482us         4.99%     114.833us      38.278us       0.000us         0.00%      13.088us       4.363us             3  
+                                      aten::convolution         0.42%       9.729us         4.71%     108.351us      36.117us       0.000us         0.00%      13.088us       4.363us             3  
+                                     aten::_convolution         0.95%      21.778us         4.29%      98.622us      32.874us       0.000us         0.00%      13.088us       4.363us             3  
+                                aten::_conv_depthwise2d         0.90%      20.730us         2.70%      62.081us      20.694us      13.088us        50.24%      13.088us       4.363us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.088us        50.24%      13.088us       4.363us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.657us        25.56%       6.657us       2.219us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        24.20%       6.304us       2.101us             3  
+                                Activity Buffer Request        76.06%       1.749ms        76.06%       1.749ms       1.749ms       2.272us         8.72%       2.272us       2.272us             1  
+                                    aten::empty_strided         1.37%      31.579us         1.37%      31.579us       5.263us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.65%     198.944us         8.65%     198.944us      22.105us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.72%      16.561us         0.94%      21.621us       2.402us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.738us         0.38%       8.738us       0.583us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.43%       9.970us         0.43%       9.970us       3.323us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       9.600us         0.42%       9.600us       3.200us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.282us         0.28%       6.541us       2.180us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.536ms
-Self CUDA time total: 26.048us
+Self CPU time total: 2.300ms
+Self CUDA time total: 26.049us
 
 
 
@@ -4191,29 +4191,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     342.974us       895.45%     342.974us     342.974us             1  
-                                            torch_eager         6.28%     155.441us        99.76%       2.468ms       2.468ms       0.000us         0.00%      40.893us      40.893us             1  
-                                           aten::conv1d         0.26%       6.400us         4.96%     122.693us      40.898us       0.000us         0.00%      22.559us       7.520us             3  
-                                      aten::convolution         0.44%      10.880us         4.70%     116.293us      38.764us       0.000us         0.00%      22.559us       7.520us             3  
-                                     aten::_convolution         1.02%      25.290us         4.26%     105.413us      35.138us       0.000us         0.00%      22.559us       7.520us             3  
-                                aten::_conv_depthwise2d         0.93%      22.901us         2.53%      62.512us      20.837us      22.559us        58.90%      22.559us       7.520us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        58.90%      22.559us       7.520us             3  
-                                               aten::to         0.27%       6.673us        87.38%       2.161ms     360.194us       0.000us         0.00%      18.334us       3.056us             6  
-                                         aten::_to_copy         1.14%      28.279us        87.11%       2.154ms     359.082us       0.000us         0.00%      18.334us       3.056us             6  
-                                            aten::copy_         2.07%      51.091us        84.63%       2.093ms     348.865us      15.743us        41.10%      18.334us       3.056us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.352us        21.81%       8.352us       2.784us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.391us        19.30%       7.391us       2.464us             3  
-                                Activity Buffer Request        70.52%       1.744ms        70.52%       1.744ms       1.744ms       2.591us         6.76%       2.591us       2.591us             1  
-                                    aten::empty_strided         1.34%      33.023us         1.34%      33.023us       5.504us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.90%     319.187us        12.90%     319.187us      35.465us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.77%      18.952us         0.98%      24.192us       2.688us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       9.200us         0.37%       9.200us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.37%       9.140us         0.37%       9.140us       3.047us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.37%       9.211us         0.37%       9.211us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       6.591us         0.33%       8.191us       2.730us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     353.533us       917.60%     353.533us     353.533us             1  
+                                            torch_eager         6.39%     158.244us        99.77%       2.472ms       2.472ms       0.000us         0.00%      41.120us      41.120us             1  
+                                           aten::conv1d         0.31%       7.740us         5.07%     125.493us      41.831us       0.000us         0.00%      22.624us       7.541us             3  
+                                      aten::convolution         0.49%      12.210us         4.75%     117.753us      39.251us       0.000us         0.00%      22.624us       7.541us             3  
+                                     aten::_convolution         0.97%      24.030us         4.26%     105.543us      35.181us       0.000us         0.00%      22.624us       7.541us             3  
+                                aten::_conv_depthwise2d         0.92%      22.692us         2.61%      64.593us      21.531us      22.624us        58.72%      22.624us       7.541us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.624us        58.72%      22.624us       7.541us             3  
+                                               aten::to         0.28%       6.850us        87.14%       2.159ms     359.798us       0.000us         0.00%      18.496us       3.083us             6  
+                                         aten::_to_copy         1.20%      29.847us        86.86%       2.152ms     358.656us       0.000us         0.00%      18.496us       3.083us             6  
+                                            aten::copy_         2.06%      51.053us        84.23%       2.087ms     347.776us      15.904us        41.28%      18.496us       3.083us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.544us        22.18%       8.544us       2.848us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        19.10%       7.360us       2.453us             3  
+                                Activity Buffer Request        69.84%       1.730ms        69.84%       1.730ms       1.730ms       2.592us         6.73%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.43%      35.432us         1.43%      35.432us       5.905us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.25%     328.377us        13.25%     328.377us      36.486us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.82%      20.390us         1.01%      25.110us       2.790us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.33%       8.220us         0.33%       8.220us       0.548us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.40%       9.850us         0.40%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.37%       9.091us         0.37%       9.091us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.680us         0.28%       7.040us       2.347us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.473ms
-Self CUDA time total: 38.302us
+Self CPU time total: 2.477ms
+Self CUDA time total: 38.528us
 
 
 
@@ -4223,29 +4223,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     320.766us       781.90%     320.766us     320.766us             1  
-                                            torch_eager         5.17%     120.612us        99.77%       2.326ms       2.326ms       0.000us         0.00%      43.616us      43.616us             1  
-                                           aten::conv1d         0.25%       5.870us         4.90%     114.223us      38.074us       0.000us         0.00%      25.312us       8.437us             3  
-                                      aten::convolution         0.44%      10.332us         4.65%     108.353us      36.118us       0.000us         0.00%      25.312us       8.437us             3  
-                                     aten::_convolution         0.97%      22.638us         4.20%      98.021us      32.674us       0.000us         0.00%      25.312us       8.437us             3  
-                                aten::_conv_depthwise2d         0.87%      20.270us         2.54%      59.242us      19.747us      25.312us        61.70%      25.312us       8.437us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        61.70%      25.312us       8.437us             3  
-                                               aten::to         0.24%       5.591us        88.58%       2.066ms     344.257us       0.000us         0.00%      18.304us       3.051us             6  
-                                         aten::_to_copy         1.00%      23.430us        88.34%       2.060ms     343.325us       0.000us         0.00%      18.304us       3.051us             6  
-                                            aten::copy_         2.09%      48.750us        86.05%       2.006ms     334.417us      15.712us        38.30%      18.304us       3.051us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.384us        20.44%       8.384us       2.795us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        17.86%       7.328us       2.443us             3  
-                                Activity Buffer Request        74.74%       1.743ms        74.74%       1.743ms       1.743ms       2.592us         6.32%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.29%      30.021us         1.29%      30.021us       5.004us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.10%     235.617us        10.10%     235.617us      26.180us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.74%      17.250us         0.95%      22.130us       2.459us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.290us         0.36%       8.290us       0.553us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.39%       9.191us         0.39%       9.191us       3.064us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       8.990us         0.39%       8.990us       2.997us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       6.321us         0.32%       7.551us       2.517us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.796us       788.65%     324.796us     324.796us             1  
+                                            torch_eager         5.71%     139.250us        99.77%       2.432ms       2.432ms       0.000us         0.00%      43.776us      43.776us             1  
+                                           aten::conv1d         0.23%       5.613us         4.72%     115.084us      38.361us       0.000us         0.00%      25.408us       8.469us             3  
+                                      aten::convolution         0.40%       9.718us         4.49%     109.471us      36.490us       0.000us         0.00%      25.408us       8.469us             3  
+                                     aten::_convolution         0.88%      21.472us         4.09%      99.753us      33.251us       0.000us         0.00%      25.408us       8.469us             3  
+                                aten::_conv_depthwise2d         0.87%      21.279us         2.58%      62.930us      20.977us      25.408us        61.69%      25.408us       8.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.408us        61.69%      25.408us       8.469us             3  
+                                               aten::to         0.27%       6.521us        88.29%       2.152ms     358.726us       0.000us         0.00%      18.368us       3.061us             6  
+                                         aten::_to_copy         0.94%      22.910us        88.02%       2.146ms     357.639us       0.000us         0.00%      18.368us       3.061us             6  
+                                            aten::copy_         1.92%      46.901us        85.84%       2.093ms     348.779us      15.776us        38.31%      18.368us       3.061us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.384us        20.36%       8.384us       2.795us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        17.95%       7.392us       2.464us             3  
+                                Activity Buffer Request        72.16%       1.759ms        72.16%       1.759ms       1.759ms       2.592us         6.29%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.24%      30.251us         1.24%      30.251us       5.042us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.68%     309.107us        12.68%     309.107us      34.345us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.981us         0.91%      22.141us       2.460us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.36%       8.721us         0.36%       8.721us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.41%      10.030us         0.41%      10.030us       3.343us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.38%       9.261us         0.38%       9.261us       3.087us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.21%       5.070us         0.26%       6.391us       2.130us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.332ms
-Self CUDA time total: 41.024us
+Self CPU time total: 2.438ms
+Self CUDA time total: 41.184us
 
 
 
@@ -4255,29 +4255,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     327.997us       319.51%     327.997us     327.997us             1  
-                                            torch_eager         5.09%     118.124us        99.77%       2.314ms       2.314ms       0.000us         0.00%     108.574us     108.574us             1  
-                                           aten::conv1d         0.23%       5.441us         4.97%     115.193us      38.398us       0.000us         0.00%      70.464us      23.488us             3  
-                                      aten::convolution         0.40%       9.290us         4.73%     109.752us      36.584us       0.000us         0.00%      70.464us      23.488us             3  
-                                     aten::_convolution         1.03%      23.820us         4.33%     100.462us      33.487us       0.000us         0.00%      70.464us      23.488us             3  
-                                aten::_conv_depthwise2d         0.94%      21.771us         2.65%      61.512us      20.504us      70.464us        68.64%      70.464us      23.488us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.464us        68.64%      70.464us      23.488us             3  
-                                               aten::to         0.25%       5.729us        88.48%       2.052ms     341.956us       0.000us         0.00%      38.110us       6.352us             6  
-                                         aten::_to_copy         1.03%      23.980us        88.23%       2.046ms     341.002us       0.000us         0.00%      38.110us       6.352us             6  
-                                            aten::copy_         2.02%      46.950us        85.91%       1.992ms     332.012us      32.191us        31.36%      38.110us       6.352us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.567us        17.11%      17.567us       5.856us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.624us        14.25%      14.624us       4.875us             3  
-                                Activity Buffer Request        76.17%       1.766ms        76.17%       1.766ms       1.766ms       5.919us         5.77%       5.919us       5.919us             1  
-                                    aten::empty_strided         1.29%      29.961us         1.29%      29.961us       4.993us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.62%     199.926us         8.62%     199.926us      22.214us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.81%      18.769us         1.04%      24.230us       2.692us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.38%       8.852us         0.38%       8.852us       0.590us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.41%       9.440us         0.41%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       9.240us         0.40%       9.240us       3.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.490us         0.29%       6.730us       2.243us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.325us       314.08%     323.325us     323.325us             1  
+                                            torch_eager         5.12%     121.144us        99.77%       2.360ms       2.360ms       0.000us         0.00%     108.958us     108.958us             1  
+                                           aten::conv1d         0.23%       5.500us         4.82%     114.012us      38.004us       0.000us         0.00%      70.719us      23.573us             3  
+                                      aten::convolution         0.39%       9.200us         4.59%     108.512us      36.171us       0.000us         0.00%      70.719us      23.573us             3  
+                                     aten::_convolution         0.91%      21.421us         4.20%      99.312us      33.104us       0.000us         0.00%      70.719us      23.573us             3  
+                                aten::_conv_depthwise2d         0.94%      22.190us         2.67%      63.201us      21.067us      70.719us        68.70%      70.719us      23.573us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.719us        68.70%      70.719us      23.573us             3  
+                                               aten::to         0.25%       5.870us        88.73%       2.099ms     349.767us       0.000us         0.00%      38.239us       6.373us             6  
+                                         aten::_to_copy         0.93%      21.960us        88.49%       2.093ms     348.789us       0.000us         0.00%      38.239us       6.373us             6  
+                                            aten::copy_         1.93%      45.542us        86.30%       2.041ms     340.189us      32.223us        31.30%      38.239us       6.373us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        17.16%      17.664us       5.888us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.559us        14.14%      14.559us       4.853us             3  
+                                Activity Buffer Request        73.13%       1.729ms        73.13%       1.729ms       1.729ms       6.016us         5.84%       6.016us       6.016us             1  
+                                    aten::empty_strided         1.25%      29.640us         1.25%      29.640us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.19%     288.286us        12.19%     288.286us      32.032us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.69%      16.350us         0.91%      21.571us       2.397us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.791us         0.37%       8.791us       0.586us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.41%       9.750us         0.41%       9.750us       3.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.39%       9.120us         0.39%       9.120us       3.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.22%       5.150us         0.27%       6.400us       2.133us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.319ms
-Self CUDA time total: 102.655us
+Self CPU time total: 2.365ms
+Self CUDA time total: 102.942us
 
 
 
@@ -4287,29 +4287,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.863us       288.98%     324.863us     324.863us             1  
-                                            torch_eager         5.27%     120.370us        99.77%       2.281ms       2.281ms       0.000us         0.00%     118.337us     118.337us             1  
-                                           aten::conv1d         0.25%       5.800us         4.98%     113.813us      37.938us       0.000us         0.00%      80.416us      26.805us             3  
-                                      aten::convolution         0.39%       8.920us         4.72%     108.013us      36.004us       0.000us         0.00%      80.416us      26.805us             3  
-                                     aten::_convolution         0.94%      21.602us         4.33%      99.093us      33.031us       0.000us         0.00%      80.416us      26.805us             3  
-                                aten::_conv_depthwise2d         0.98%      22.480us         2.74%      62.561us      20.854us      80.416us        71.53%      80.416us      26.805us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.416us        71.53%      80.416us      26.805us             3  
-                                               aten::to         0.26%       6.002us        88.33%       2.019ms     336.570us       0.000us         0.00%      37.921us       6.320us             6  
-                                         aten::_to_copy         0.97%      22.148us        88.07%       2.013ms     335.570us       0.000us         0.00%      37.921us       6.320us             6  
-                                            aten::copy_         2.05%      46.852us        85.75%       1.960ms     326.725us      32.001us        28.47%      37.921us       6.320us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.472us        15.54%      17.472us       5.824us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.529us        12.92%      14.529us       4.843us             3  
-                                Activity Buffer Request        76.07%       1.739ms        76.07%       1.739ms       1.739ms       5.920us         5.27%       5.920us       5.920us             1  
-                                    aten::empty_strided         1.35%      30.921us         1.35%      30.921us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.60%     196.555us         8.60%     196.555us      21.839us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.81%      18.430us         1.02%      23.240us       2.582us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.36%       8.150us         0.36%       8.150us       0.543us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.159us         0.40%       9.159us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.39%       8.901us         0.39%       8.901us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       5.410us         0.29%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.583us       282.68%     319.583us     319.583us             1  
+                                            torch_eager         5.26%     120.047us        99.77%       2.277ms       2.277ms       0.000us         0.00%     119.039us     119.039us             1  
+                                           aten::conv1d         0.28%       6.329us         5.02%     114.481us      38.160us       0.000us         0.00%      80.863us      26.954us             3  
+                                      aten::convolution         0.40%       9.201us         4.74%     108.152us      36.051us       0.000us         0.00%      80.863us      26.954us             3  
+                                     aten::_convolution         0.92%      21.079us         4.34%      98.951us      32.984us       0.000us         0.00%      80.863us      26.954us             3  
+                                aten::_conv_depthwise2d         0.91%      20.811us         2.77%      63.141us      21.047us      80.863us        71.53%      80.863us      26.954us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.863us        71.53%      80.863us      26.954us             3  
+                                               aten::to         0.25%       5.630us        88.39%       2.017ms     336.237us       0.000us         0.00%      38.176us       6.363us             6  
+                                         aten::_to_copy         0.94%      21.550us        88.14%       2.012ms     335.299us       0.000us         0.00%      38.176us       6.363us             6  
+                                            aten::copy_         1.99%      45.392us        85.93%       1.961ms     326.887us      32.192us        28.47%      38.176us       6.363us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.600us        15.57%      17.600us       5.867us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        12.91%      14.592us       4.864us             3  
+                                Activity Buffer Request        75.53%       1.724ms        75.53%       1.724ms       1.724ms       5.984us         5.29%       5.984us       5.984us             1  
+                                    aten::empty_strided         1.27%      28.920us         1.27%      28.920us       4.820us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.42%     215.013us         9.42%     215.013us      23.890us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.71%      16.099us         0.92%      21.000us       2.333us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.37%       8.402us         0.37%       8.402us       0.560us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.44%      10.070us         0.44%      10.070us       3.357us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       9.140us         0.40%       9.140us       3.047us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.200us         0.28%       6.490us       2.163us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.286ms
-Self CUDA time total: 112.417us
+Self CPU time total: 2.283ms
+Self CUDA time total: 113.055us
 
 
 
@@ -4319,29 +4319,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         4.92%     118.303us        96.45%       2.317ms       2.317ms       0.000us         0.00%     464.513us     464.513us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     454.847us       106.94%     454.847us     454.847us             1  
-                                           aten::conv1d         0.23%       5.570us         4.84%     116.393us      38.798us       0.000us         0.00%     280.159us      93.386us             3  
-                                      aten::convolution         0.41%       9.781us         4.61%     110.823us      36.941us       0.000us         0.00%     280.159us      93.386us             3  
-                                     aten::_convolution         0.97%      23.190us         4.21%     101.042us      33.681us       0.000us         0.00%     280.159us      93.386us             3  
-                                aten::_conv_depthwise2d         0.92%      22.172us         2.56%      61.572us      20.524us     280.159us        65.87%     280.159us      93.386us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     280.159us        65.87%     280.159us      93.386us             3  
-                                               aten::to         0.25%       6.010us        85.54%       2.055ms     342.552us       0.000us         0.00%     184.354us      30.726us             6  
-                                         aten::_to_copy         1.07%      25.702us        85.29%       2.049ms     341.550us       0.000us         0.00%     184.354us      30.726us             6  
-                                            aten::copy_         1.90%      45.633us        82.97%       1.993ms     332.247us     145.185us        34.13%     184.354us      30.726us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     104.993us        24.68%     104.993us      34.998us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.192us         9.45%      40.192us      13.397us             3  
-                                Activity Buffer Request        73.74%       1.772ms        73.74%       1.772ms       1.772ms      39.169us         9.21%      39.169us      39.169us             1  
-                                    aten::empty_strided         1.25%      30.119us         1.25%      30.119us       5.020us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.18%     196.512us         8.18%     196.512us      21.835us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.74%      17.761us         0.97%      23.251us       2.583us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.39%       9.350us         0.39%       9.350us       0.623us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.39%       9.400us         0.39%       9.400us       3.133us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       9.580us         0.40%       9.580us       3.193us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       6.480us         0.34%       8.150us       2.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         4.95%     116.473us        96.30%       2.266ms       2.266ms       0.000us         0.00%     463.098us     463.098us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     450.875us       106.36%     450.875us     450.875us             1  
+                                           aten::conv1d         0.27%       6.419us         4.91%     115.592us      38.531us       0.000us         0.00%     279.421us      93.140us             3  
+                                      aten::convolution         0.39%       9.200us         4.64%     109.173us      36.391us       0.000us         0.00%     279.421us      93.140us             3  
+                                     aten::_convolution         0.90%      21.142us         4.25%      99.973us      33.324us       0.000us         0.00%     279.421us      93.140us             3  
+                                aten::_conv_depthwise2d         0.92%      21.621us         2.71%      63.861us      21.287us     279.421us        65.92%     279.421us      93.140us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     279.421us        65.92%     279.421us      93.140us             3  
+                                               aten::to         0.25%       5.769us        85.33%       2.008ms     334.609us       0.000us         0.00%     183.677us      30.613us             6  
+                                         aten::_to_copy         1.28%      30.122us        85.09%       2.002ms     333.647us       0.000us         0.00%     183.677us      30.613us             6  
+                                            aten::copy_         1.97%      46.430us        82.58%       1.943ms     323.810us     144.478us        34.08%     183.677us      30.613us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     104.191us        24.58%     104.191us      34.730us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.287us         9.50%      40.287us      13.429us             3  
+                                Activity Buffer Request        73.06%       1.719ms        73.06%       1.719ms       1.719ms      39.199us         9.25%      39.199us      39.199us             1  
+                                    aten::empty_strided         1.23%      28.900us         1.23%      28.900us       4.817us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.51%     200.175us         8.51%     200.175us      22.242us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.69%      16.350us         0.92%      21.710us       2.412us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.919us         0.38%       8.919us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.42%       9.910us         0.42%       9.910us       3.303us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       9.610us         0.41%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.22%       5.091us         0.27%       6.340us       2.113us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.403ms
-Self CUDA time total: 425.344us
+Self CPU time total: 2.353ms
+Self CUDA time total: 423.899us
 
 
 
@@ -4351,29 +4351,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         5.04%     119.094us        95.77%       2.263ms       2.263ms       0.000us         0.00%     471.612us     471.612us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.149us       106.82%     465.149us     465.149us             1  
-                                           aten::conv1d         0.25%       5.900us         4.82%     113.812us      37.937us       0.000us         0.00%     297.789us      99.263us             3  
-                                      aten::convolution         0.42%       9.940us         4.57%     107.912us      35.971us       0.000us         0.00%     297.789us      99.263us             3  
-                                     aten::_convolution         0.91%      21.591us         4.15%      97.972us      32.657us       0.000us         0.00%     297.789us      99.263us             3  
-                                aten::_conv_depthwise2d         0.88%      20.792us         2.56%      60.392us      20.131us     297.789us        68.39%     297.789us      99.263us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     297.789us        68.39%     297.789us      99.263us             3  
-                                               aten::to         0.25%       5.882us        84.81%       2.004ms     334.050us       0.000us         0.00%     173.823us      28.970us             6  
-                                         aten::_to_copy         1.03%      24.309us        84.56%       1.998ms     333.069us       0.000us         0.00%     173.823us      28.970us             6  
-                                            aten::copy_         1.99%      46.992us        82.30%       1.945ms     324.149us     137.663us        31.61%     173.823us      28.970us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      97.280us        22.34%      97.280us      32.427us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.383us         9.27%      40.383us      13.461us             3  
-                                Activity Buffer Request        72.83%       1.721ms        72.83%       1.721ms       1.721ms      36.160us         8.30%      36.160us      36.160us             1  
-                                    aten::empty_strided         1.24%      29.211us         1.24%      29.211us       4.869us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.35%     197.423us         8.35%     197.423us      21.936us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.69%      16.359us         0.90%      21.340us       2.371us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.37%       8.812us         0.37%       8.812us       0.587us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.41%       9.620us         0.41%       9.620us       3.207us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       9.410us         0.40%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       6.299us         0.32%       7.670us       2.557us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     485.403us       111.78%     485.403us     485.403us             1  
+                                            torch_eager         5.11%     119.851us        95.84%       2.247ms       2.247ms       0.000us         0.00%     469.691us     469.691us             1  
+                                           aten::conv1d         0.25%       5.911us         5.75%     134.703us      44.901us       0.000us         0.00%     298.875us      99.625us             3  
+                                      aten::convolution         0.39%       9.088us         5.49%     128.792us      42.931us       0.000us         0.00%     298.875us      99.625us             3  
+                                     aten::_convolution         0.90%      20.984us         5.11%     119.704us      39.901us       0.000us         0.00%     298.875us      99.625us             3  
+                                aten::_conv_depthwise2d         0.91%      21.360us         3.56%      83.490us      27.830us     298.875us        68.83%     298.875us      99.625us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.875us        68.83%     298.875us      99.625us             3  
+                                               aten::to         0.24%       5.731us        83.88%       1.966ms     327.731us       0.000us         0.00%     170.816us      28.469us             6  
+                                         aten::_to_copy         0.95%      22.370us        83.64%       1.961ms     326.776us       0.000us         0.00%     170.816us      28.469us             6  
+                                            aten::copy_         2.00%      46.872us        81.44%       1.909ms     318.188us     135.360us        31.17%     170.816us      28.469us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      95.008us        21.88%      95.008us      31.669us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.352us         9.29%      40.352us      13.451us             3  
+                                Activity Buffer Request        71.79%       1.683ms        71.79%       1.683ms       1.683ms      35.456us         8.17%      35.456us      35.456us             1  
+                                    aten::empty_strided         1.24%      29.152us         1.24%      29.152us       4.859us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.71%     204.083us         8.71%     204.083us      22.676us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.70%      16.420us         0.92%      21.581us       2.398us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.941us         0.38%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%      27.990us         1.19%      27.990us       9.330us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       9.520us         0.41%       9.520us       3.173us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.350us         0.29%       6.870us       2.290us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.363ms
-Self CUDA time total: 435.452us
+Self CPU time total: 2.344ms
+Self CUDA time total: 434.235us
 
 
 
@@ -4383,29 +4383,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     313.695us      1667.17%     313.695us     313.695us             1  
-                                            torch_eager        14.13%     113.614us        99.32%     798.339us     798.339us       0.000us         0.00%      20.800us      20.800us             1  
-                                               aten::to         0.70%       5.659us        68.10%     547.391us      91.232us       0.000us         0.00%      13.664us       2.277us             6  
-                                         aten::_to_copy         3.00%      24.130us        67.40%     541.732us      90.289us       0.000us         0.00%      13.664us       2.277us             6  
-                                            aten::copy_         5.94%      47.780us        60.67%     487.691us      81.282us      11.680us        62.07%      13.664us       2.277us             6  
-                                           aten::conv1d         0.70%       5.600us        14.11%     113.433us      37.811us       0.000us         0.00%       7.136us       2.379us             3  
-                                      aten::convolution         1.23%       9.922us        13.42%     107.833us      35.944us       0.000us         0.00%       7.136us       2.379us             3  
-                                     aten::_convolution         2.79%      22.410us        12.18%      97.911us      32.637us       0.000us         0.00%       7.136us       2.379us             3  
-                                aten::_conv_depthwise2d         2.59%      20.852us         7.49%      60.242us      20.081us       7.136us        37.93%       7.136us       2.379us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.136us        37.93%       7.136us       2.379us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        31.80%       5.984us       1.995us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.27%       5.696us       1.899us             3  
-                                Activity Buffer Request        32.74%     263.147us        32.74%     263.147us     263.147us       1.984us        10.54%       1.984us       1.984us             1  
-                                    aten::empty_strided         3.72%      29.911us         3.72%      29.911us       4.985us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.67%     198.304us        24.67%     198.304us      22.034us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.90%      15.298us         2.42%      19.460us       2.162us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.96%       7.694us         0.96%       7.694us       0.513us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.11%       8.950us         1.11%       8.950us       2.983us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.11%       8.900us         1.11%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.80%       6.399us         0.97%       7.810us       2.603us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     312.893us      1660.09%     312.893us     312.893us             1  
+                                            torch_eager        14.08%     114.574us        99.37%     808.348us     808.348us       0.000us         0.00%      20.800us      20.800us             1  
+                                               aten::to         0.69%       5.578us        68.43%     556.700us      92.783us       0.000us         0.00%      13.600us       2.267us             6  
+                                         aten::_to_copy         2.67%      21.680us        67.75%     551.122us      91.854us       0.000us         0.00%      13.600us       2.267us             6  
+                                            aten::copy_         5.64%      45.852us        61.26%     498.351us      83.058us      11.648us        61.80%      13.600us       2.267us             6  
+                                           aten::conv1d         0.80%       6.521us        13.85%     112.642us      37.547us       0.000us         0.00%       7.200us       2.400us             3  
+                                      aten::convolution         1.21%       9.859us        13.05%     106.121us      35.374us       0.000us         0.00%       7.200us       2.400us             3  
+                                     aten::_convolution         2.61%      21.222us        11.83%      96.262us      32.087us       0.000us         0.00%       7.200us       2.400us             3  
+                                aten::_conv_depthwise2d         2.67%      21.701us         7.50%      61.001us      20.334us       7.200us        38.20%       7.200us       2.400us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.200us        38.20%       7.200us       2.400us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        31.75%       5.984us       1.995us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        30.05%       5.664us       1.888us             3  
+                                Activity Buffer Request        33.80%     274.946us        33.80%     274.946us     274.946us       1.952us        10.36%       1.952us       1.952us             1  
+                                    aten::empty_strided         3.82%      31.091us         3.82%      31.091us       5.182us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.38%     198.353us        24.38%     198.353us      22.039us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.88%      15.291us         2.44%      19.860us       2.207us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.00%       8.160us         1.00%       8.160us       0.544us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       9.720us         1.19%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.08%       8.780us         1.08%       8.780us       2.927us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.64%       5.210us         0.80%       6.500us       2.167us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 803.779us
-Self CUDA time total: 18.816us
+Self CPU time total: 813.488us
+Self CUDA time total: 18.848us
 
 
 
@@ -4415,29 +4415,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     315.484us      1613.56%     315.484us     315.484us             1  
-                                            torch_eager        14.44%     114.641us        99.35%     788.539us     788.539us       0.000us         0.00%      21.568us      21.568us             1  
-                                               aten::to         0.74%       5.859us        67.65%     536.963us      89.494us       0.000us         0.00%      13.632us       2.272us             6  
-                                         aten::_to_copy         3.05%      24.201us        66.91%     531.104us      88.517us       0.000us         0.00%      13.632us       2.272us             6  
-                                            aten::copy_         6.01%      47.671us        60.17%     477.592us      79.599us      11.616us        59.41%      13.632us       2.272us             6  
-                                           aten::conv1d         0.73%       5.820us        14.10%     111.923us      37.308us       0.000us         0.00%       7.936us       2.645us             3  
-                                      aten::convolution         1.13%       8.991us        13.37%     106.103us      35.368us       0.000us         0.00%       7.936us       2.645us             3  
-                                     aten::_convolution         2.72%      21.550us        12.24%      97.112us      32.371us       0.000us         0.00%       7.936us       2.645us             3  
-                                aten::_conv_depthwise2d         2.66%      21.080us         7.67%      60.892us      20.297us       7.936us        40.59%       7.936us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.59%       7.936us       2.645us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        30.28%       5.920us       1.973us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        29.13%       5.696us       1.899us             3  
-                                Activity Buffer Request        32.17%     255.357us        32.17%     255.357us     255.357us       2.016us        10.31%       2.016us       2.016us             1  
-                                    aten::empty_strided         3.69%      29.311us         3.69%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.68%     195.894us        24.68%     195.894us      21.766us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.03%      16.101us         2.61%      20.721us       2.302us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.02%       8.070us         1.02%       8.070us       0.538us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.14%       9.082us         1.14%       9.082us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.18%       9.400us         1.18%       9.400us       3.133us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.66%       5.230us         0.82%       6.480us       2.160us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.950us      1626.55%     316.950us     316.950us             1  
+                                            torch_eager        14.00%     113.783us        99.35%     807.628us     807.628us       0.000us         0.00%      21.437us      21.437us             1  
+                                               aten::to         0.71%       5.740us        68.34%     555.502us      92.584us       0.000us         0.00%      13.502us       2.250us             6  
+                                         aten::_to_copy         2.66%      21.649us        67.63%     549.762us      91.627us       0.000us         0.00%      13.502us       2.250us             6  
+                                            aten::copy_         5.63%      45.791us        61.33%     498.512us      83.085us      11.551us        59.28%      13.502us       2.250us             6  
+                                           aten::conv1d         0.67%       5.481us        13.97%     113.553us      37.851us       0.000us         0.00%       7.935us       2.645us             3  
+                                      aten::convolution         1.11%       9.000us        13.29%     108.072us      36.024us       0.000us         0.00%       7.935us       2.645us             3  
+                                     aten::_convolution         2.66%      21.620us        12.19%      99.072us      33.024us       0.000us         0.00%       7.935us       2.645us             3  
+                                aten::_conv_depthwise2d         2.55%      20.761us         7.74%      62.951us      20.984us       7.935us        40.72%       7.935us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        40.72%       7.935us       2.645us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.887us        30.21%       5.887us       1.962us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.07%       5.664us       1.888us             3  
+                                Activity Buffer Request        33.87%     275.306us        33.87%     275.306us     275.306us       1.951us        10.01%       1.951us       1.951us             1  
+                                    aten::empty_strided         3.64%      29.601us         3.64%      29.601us       4.934us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.66%     200.485us        24.66%     200.485us      22.276us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.94%      15.809us         2.55%      20.761us       2.307us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.03%       8.362us         1.03%       8.362us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       9.700us         1.19%       9.700us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.16%       9.420us         1.16%       9.420us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.250us         0.78%       6.380us       2.127us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 793.700us
-Self CUDA time total: 19.552us
+Self CPU time total: 812.878us
+Self CUDA time total: 19.486us
 
 
 
@@ -4447,29 +4447,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.227us      1646.26%     319.227us     319.227us             1  
-                                            torch_eager        14.35%     115.202us        99.35%     797.750us     797.750us       0.000us         0.00%      21.567us      21.567us             1  
-                                               aten::to         0.73%       5.879us        67.70%     543.593us      90.599us       0.000us         0.00%      14.367us       2.394us             6  
-                                         aten::_to_copy         2.99%      23.982us        66.96%     537.714us      89.619us       0.000us         0.00%      14.367us       2.394us             6  
-                                            aten::copy_         5.85%      46.982us        60.24%     483.712us      80.619us      12.191us        62.87%      14.367us       2.394us             6  
-                                           aten::conv1d         0.69%       5.560us        14.06%     112.933us      37.644us       0.000us         0.00%       7.200us       2.400us             3  
-                                      aten::convolution         1.13%       9.060us        13.37%     107.373us      35.791us       0.000us         0.00%       7.200us       2.400us             3  
-                                     aten::_convolution         2.73%      21.891us        12.24%      98.313us      32.771us       0.000us         0.00%       7.200us       2.400us             3  
-                                aten::_conv_depthwise2d         2.58%      20.719us         7.42%      59.602us      19.867us       7.200us        37.13%       7.200us       2.400us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.200us        37.13%       7.200us       2.400us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.954us      1634.46%     316.954us     316.954us             1  
+                                            torch_eager        14.55%     117.095us        99.34%     799.337us     799.337us       0.000us         0.00%      21.600us      21.600us             1  
+                                               aten::to         0.68%       5.480us        67.45%     542.761us      90.460us       0.000us         0.00%      14.432us       2.405us             6  
+                                         aten::_to_copy         2.68%      21.530us        66.77%     537.281us      89.547us       0.000us         0.00%      14.432us       2.405us             6  
+                                            aten::copy_         5.75%      46.293us        60.38%     485.871us      80.978us      12.224us        63.04%      14.432us       2.405us             6  
+                                           aten::conv1d         0.85%       6.860us        14.15%     113.831us      37.944us       0.000us         0.00%       7.168us       2.389us             3  
+                                      aten::convolution         1.15%       9.280us        13.29%     106.971us      35.657us       0.000us         0.00%       7.168us       2.389us             3  
+                                     aten::_convolution         2.68%      21.591us        12.14%      97.691us      32.564us       0.000us         0.00%       7.168us       2.389us             3  
+                                aten::_conv_depthwise2d         2.56%      20.631us         7.61%      61.261us      20.420us       7.168us        36.96%       7.168us       2.389us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.168us        36.96%       7.168us       2.389us             3  
 void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.34%       6.272us       2.091us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.52%       5.919us       1.973us             3  
-                                Activity Buffer Request        32.74%     262.897us        32.74%     262.897us     262.897us       2.176us        11.22%       2.176us       2.176us             1  
-                                    aten::empty_strided         3.74%      30.020us         3.74%      30.020us       5.003us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.16%     194.015us        24.16%     194.015us      21.557us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.13%      17.072us         2.73%      21.901us       2.433us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.03%       8.259us         1.03%       8.259us       0.551us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.15%       9.210us         1.15%       9.210us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.18%       9.491us         1.18%       9.491us       3.164us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.90%       7.200us         1.06%       8.540us       2.847us       0.000us         0.00%       0.000us       0.000us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us        30.69%       5.952us       1.984us             3  
+                                Activity Buffer Request        32.96%     265.175us        32.96%     265.175us     265.175us       2.208us        11.39%       2.208us       2.208us             1  
+                                    aten::empty_strided         3.71%      29.880us         3.71%      29.880us       4.980us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.44%     196.663us        24.44%     196.663us      21.851us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.01%      16.189us         2.67%      21.519us       2.391us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.750us         1.09%       8.750us       0.583us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%       9.890us         1.23%       9.890us       3.297us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.05%       8.480us         1.05%       8.480us       2.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.220us         0.80%       6.399us       2.133us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 802.990us
-Self CUDA time total: 19.391us
+Self CPU time total: 804.637us
+Self CUDA time total: 19.392us
 
 
 
@@ -4479,29 +4479,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     318.585us      1587.92%     318.585us     318.585us             1  
-                                            torch_eager        14.32%     114.402us        99.36%     793.540us     793.540us       0.000us         0.00%      22.271us      22.271us             1  
-                                               aten::to         0.74%       5.889us        67.47%     538.893us      89.816us       0.000us         0.00%      14.368us       2.395us             6  
-                                         aten::_to_copy         2.86%      22.853us        66.74%     533.004us      88.834us       0.000us         0.00%      14.368us       2.395us             6  
-                                            aten::copy_         5.91%      47.181us        60.05%     479.601us      79.933us      12.160us        60.61%      14.368us       2.395us             6  
-                                           aten::conv1d         0.72%       5.730us        14.38%     114.863us      38.288us       0.000us         0.00%       7.903us       2.634us             3  
-                                      aten::convolution         1.15%       9.210us        13.66%     109.133us      36.378us       0.000us         0.00%       7.903us       2.634us             3  
-                                     aten::_convolution         2.90%      23.191us        12.51%      99.923us      33.308us       0.000us         0.00%       7.903us       2.634us             3  
-                                aten::_conv_depthwise2d         2.58%      20.640us         7.49%      59.852us      19.951us       7.903us        39.39%       7.903us       2.634us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us        39.39%       7.903us       2.634us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        31.42%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        29.19%       5.856us       1.952us             3  
-                                Activity Buffer Request        32.57%     260.117us        32.57%     260.117us     260.117us       2.208us        11.01%       2.208us       2.208us             1  
-                                    aten::empty_strided         3.83%      30.550us         3.83%      30.550us       5.092us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.14%     192.815us        24.14%     192.815us      21.424us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.21%      17.653us         2.81%      22.412us       2.490us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.03%       8.198us         1.03%       8.198us       0.547us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.14%       9.130us         1.14%       9.130us       3.043us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       6.130us         0.93%       7.460us       2.487us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     317.850us      1579.14%     317.850us     317.850us             1  
+                                            torch_eager        14.46%     115.693us        99.28%     794.297us     794.297us       0.000us         0.00%      22.336us      22.336us             1  
+                                               aten::to         0.72%       5.761us        67.50%     540.021us      90.003us       0.000us         0.00%      14.368us       2.395us             6  
+                                         aten::_to_copy         2.69%      21.489us        66.78%     534.260us      89.043us       0.000us         0.00%      14.368us       2.395us             6  
+                                            aten::copy_         5.80%      46.383us        60.34%     482.741us      80.457us      12.160us        60.41%      14.368us       2.395us             6  
+                                           aten::conv1d         0.71%       5.650us        14.21%     113.673us      37.891us       0.000us         0.00%       7.968us       2.656us             3  
+                                      aten::convolution         1.34%      10.740us        13.50%     108.023us      36.008us       0.000us         0.00%       7.968us       2.656us             3  
+                                     aten::_convolution         2.73%      21.881us        12.16%      97.283us      32.428us       0.000us         0.00%       7.968us       2.656us             3  
+                                aten::_conv_depthwise2d         2.59%      20.703us         7.59%      60.702us      20.234us       7.968us        39.59%       7.968us       2.656us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        39.59%       7.968us       2.656us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.00%       6.240us       2.080us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.41%       5.920us       1.973us             3  
+                                Activity Buffer Request        32.71%     261.746us        32.71%     261.746us     261.746us       2.208us        10.97%       2.208us       2.208us             1  
+                                    aten::empty_strided         3.75%      30.030us         3.75%      30.030us       5.005us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.46%     195.702us        24.46%     195.702us      21.745us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.97%      15.730us         2.60%      20.791us       2.310us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.06%       8.481us         1.06%       8.481us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%       9.339us         1.17%       9.339us       3.113us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.220us         0.79%       6.359us       2.120us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 798.670us
-Self CUDA time total: 20.063us
+Self CPU time total: 800.086us
+Self CUDA time total: 20.128us
 
 
 
@@ -4511,29 +4511,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.920us       904.53%     325.920us     325.920us             1  
-                                            torch_eager        14.34%     116.794us        99.35%     809.360us     809.360us       0.000us         0.00%      38.624us      38.624us             1  
-                                           aten::conv1d         0.72%       5.870us        14.19%     115.602us      38.534us       0.000us         0.00%      20.192us       6.731us             3  
-                                      aten::convolution         1.16%       9.420us        13.47%     109.732us      36.577us       0.000us         0.00%      20.192us       6.731us             3  
-                                     aten::_convolution         2.80%      22.850us        12.31%     100.312us      33.437us       0.000us         0.00%      20.192us       6.731us             3  
-                                aten::_conv_depthwise2d         2.58%      21.022us         7.49%      61.052us      20.351us      20.192us        56.04%      20.192us       6.731us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.192us        56.04%      20.192us       6.731us             3  
-                                               aten::to         0.73%       5.980us        67.60%     550.703us      91.784us       0.000us         0.00%      18.432us       3.072us             6  
-                                         aten::_to_copy         2.92%      23.789us        66.86%     544.723us      90.787us       0.000us         0.00%      18.432us       3.072us             6  
-                                            aten::copy_         5.93%      48.281us        60.23%     490.683us      81.781us      15.840us        43.96%      18.432us       3.072us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        23.45%       8.448us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        20.52%       7.392us       2.464us             3  
-                                Activity Buffer Request        30.82%     251.076us        30.82%     251.076us     251.076us       2.592us         7.19%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.71%      30.251us         3.71%      30.251us       5.042us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.06%     212.336us        26.06%     212.336us      23.593us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.07%      16.901us         2.67%      21.731us       2.415us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.01%       8.191us         1.01%       8.191us       0.546us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.17%       9.540us         1.17%       9.540us       3.180us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.16%       9.480us         1.16%       9.480us       3.160us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.88%       7.140us         1.02%       8.340us       2.780us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     317.817us       880.50%     317.817us     317.817us             1  
+                                            torch_eager        14.26%     114.771us        99.34%     799.407us     799.407us       0.000us         0.00%      38.718us      38.718us             1  
+                                           aten::conv1d         0.73%       5.881us        14.18%     114.103us      38.034us       0.000us         0.00%      20.223us       6.741us             3  
+                                      aten::convolution         1.23%       9.899us        13.45%     108.222us      36.074us       0.000us         0.00%      20.223us       6.741us             3  
+                                     aten::_convolution         2.58%      20.781us        12.22%      98.323us      32.774us       0.000us         0.00%      20.223us       6.741us             3  
+                                aten::_conv_depthwise2d         2.56%      20.581us         7.80%      62.731us      20.910us      20.223us        56.03%      20.223us       6.741us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.223us        56.03%      20.223us       6.741us             3  
+                                               aten::to         0.70%       5.603us        67.84%     545.903us      90.984us       0.000us         0.00%      18.495us       3.083us             6  
+                                         aten::_to_copy         2.70%      21.701us        67.14%     540.300us      90.050us       0.000us         0.00%      18.495us       3.083us             6  
+                                            aten::copy_         5.63%      45.281us        60.69%     488.359us      81.393us      15.872us        43.97%      18.495us       3.083us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.511us        23.58%       8.511us       2.837us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        20.39%       7.361us       2.454us             3  
+                                Activity Buffer Request        33.10%     266.365us        33.10%     266.365us     266.365us       2.623us         7.27%       2.623us       2.623us             1  
+                                    aten::empty_strided         3.76%      30.240us         3.76%      30.240us       5.040us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.83%     199.834us        24.83%     199.834us      22.204us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.98%      15.970us         2.59%      20.860us       2.318us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.05%       8.430us         1.05%       8.430us       0.562us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%       9.879us         1.23%       9.879us       3.293us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.14%       9.150us         1.14%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.64%       5.171us         0.80%       6.441us       2.147us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 814.681us
-Self CUDA time total: 36.032us
+Self CPU time total: 804.678us
+Self CUDA time total: 36.095us
 
 
 
@@ -4543,29 +4543,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.527us       848.40%     322.527us     322.527us             1  
-                                            torch_eager         5.15%     118.831us        99.77%       2.301ms       2.301ms       0.000us         0.00%      40.608us      40.608us             1  
-                                           aten::conv1d         0.25%       5.790us         4.95%     114.162us      38.054us       0.000us         0.00%      22.272us       7.424us             3  
-                                      aten::convolution         0.40%       9.280us         4.70%     108.372us      36.124us       0.000us         0.00%      22.272us       7.424us             3  
-                                     aten::_convolution         0.97%      22.470us         4.30%      99.092us      33.031us       0.000us         0.00%      22.272us       7.424us             3  
-                                aten::_conv_depthwise2d         0.93%      21.479us         2.62%      60.532us      20.177us      22.272us        58.59%      22.272us       7.424us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.272us        58.59%      22.272us       7.424us             3  
-                                               aten::to         0.26%       6.081us        88.49%       2.041ms     340.127us       0.000us         0.00%      18.336us       3.056us             6  
-                                         aten::_to_copy         1.04%      23.880us        88.22%       2.035ms     339.113us       0.000us         0.00%      18.336us       3.056us             6  
-                                            aten::copy_         2.05%      47.372us        85.91%       1.981ms     330.228us      15.744us        41.41%      18.336us       3.056us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        22.14%       8.416us       2.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.28%       7.328us       2.443us             3  
-                                Activity Buffer Request        76.31%       1.760ms        76.31%       1.760ms       1.760ms       2.592us         6.82%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.28%      29.431us         1.28%      29.431us       4.905us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         8.43%     194.465us         8.43%     194.465us      21.607us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.76%      17.531us         0.99%      22.821us       2.536us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.40%       9.119us         0.40%       9.119us       0.608us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.40%       9.181us         0.40%       9.181us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.41%       9.511us         0.41%       9.511us       3.170us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       6.360us         0.35%       8.040us       2.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     317.472us       827.44%     317.472us     317.472us             1  
+                                            torch_eager         5.39%     120.521us        99.74%       2.231ms       2.231ms       0.000us         0.00%      40.993us      40.993us             1  
+                                           aten::conv1d         0.34%       7.591us         5.06%     113.112us      37.704us       0.000us         0.00%      22.432us       7.477us             3  
+                                      aten::convolution         0.40%       8.869us         4.72%     105.521us      35.174us       0.000us         0.00%      22.432us       7.477us             3  
+                                     aten::_convolution         0.93%      20.812us         4.32%      96.652us      32.217us       0.000us         0.00%      22.432us       7.477us             3  
+                                aten::_conv_depthwise2d         0.93%      20.800us         2.73%      61.101us      20.367us      22.432us        58.47%      22.432us       7.477us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.432us        58.47%      22.432us       7.477us             3  
+                                               aten::to         0.25%       5.601us        88.17%       1.973ms     328.759us       0.000us         0.00%      18.561us       3.093us             6  
+                                         aten::_to_copy         0.97%      21.650us        87.92%       1.967ms     327.825us       0.000us         0.00%      18.561us       3.093us             6  
+                                            aten::copy_         2.07%      46.341us        85.60%       1.915ms     319.179us      15.936us        41.53%      18.561us       3.093us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.545us        22.27%       8.545us       2.848us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.391us        19.26%       7.391us       2.464us             3  
+                                Activity Buffer Request        75.81%       1.696ms        75.81%       1.696ms       1.696ms       2.625us         6.84%       2.625us       2.625us             1  
+                                    aten::empty_strided         1.35%      30.230us         1.35%      30.230us       5.038us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         8.67%     194.015us         8.67%     194.015us      21.557us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.71%      15.960us         0.93%      20.871us       2.319us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.38%       8.501us         0.38%       8.501us       0.567us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.44%       9.900us         0.44%       9.900us       3.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       9.100us         0.41%       9.100us       3.033us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.23%       5.230us         0.29%       6.510us       2.170us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.306ms
-Self CUDA time total: 38.016us
+Self CPU time total: 2.237ms
+Self CUDA time total: 38.368us
 
 
 
@@ -4575,29 +4575,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.615us       522.09%     334.615us     334.615us             1  
-                                            torch_eager        14.27%     115.903us        99.37%     807.269us     807.269us       0.000us         0.00%      68.188us      68.188us             1  
-                                           aten::conv1d         0.69%       5.580us        15.60%     126.753us      42.251us       0.000us         0.00%      41.662us      13.887us             3  
-                                      aten::convolution         1.31%      10.620us        14.92%     121.173us      40.391us       0.000us         0.00%      41.662us      13.887us             3  
-                                     aten::_convolution         3.95%      32.112us        13.61%     110.553us      36.851us       0.000us         0.00%      41.662us      13.887us             3  
-                                aten::_conv_depthwise2d         2.71%      21.990us         7.82%      63.492us      21.164us      41.662us        65.00%      41.662us      13.887us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.662us        65.00%      41.662us      13.887us             3  
-                                               aten::to         0.71%       5.728us        66.32%     538.813us      89.802us       0.000us         0.00%      26.526us       4.421us             6  
-                                         aten::_to_copy         2.82%      22.881us        65.62%     533.085us      88.848us       0.000us         0.00%      26.526us       4.421us             6  
-                                            aten::copy_         5.89%      47.814us        59.03%     479.514us      79.919us      22.430us        35.00%      26.526us       4.421us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        18.72%      12.000us       4.000us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.430us        16.27%      10.430us       3.477us             3  
-                                Activity Buffer Request        31.09%     252.576us        31.09%     252.576us     252.576us       4.096us         6.39%       4.096us       4.096us             1  
-                                    aten::empty_strided         3.78%      30.690us         3.78%      30.690us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.86%     201.964us        24.86%     201.964us      22.440us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.01%      16.320us         2.66%      21.600us       2.400us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.700us         1.07%       8.700us       0.580us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.17%       9.541us         1.17%       9.541us       3.180us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.12%       9.121us         1.12%       9.121us       3.040us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.69%       5.569us         0.85%       6.929us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     317.307us       494.31%     317.307us     317.307us             1  
+                                            torch_eager        14.64%     116.902us        99.38%     793.617us     793.617us       0.000us         0.00%      68.288us      68.288us             1  
+                                           aten::conv1d         0.71%       5.701us        14.04%     112.123us      37.374us       0.000us         0.00%      41.760us      13.920us             3  
+                                      aten::convolution         1.27%      10.150us        13.33%     106.422us      35.474us       0.000us         0.00%      41.760us      13.920us             3  
+                                     aten::_convolution         2.67%      21.299us        12.06%      96.272us      32.091us       0.000us         0.00%      41.760us      13.920us             3  
+                                aten::_conv_depthwise2d         2.60%      20.762us         7.62%      60.882us      20.294us      41.760us        65.05%      41.760us      13.920us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.760us        65.05%      41.760us      13.920us             3  
+                                               aten::to         0.69%       5.531us        67.64%     540.181us      90.030us       0.000us         0.00%      26.528us       4.421us             6  
+                                         aten::_to_copy         2.67%      21.339us        66.95%     534.650us      89.108us       0.000us         0.00%      26.528us       4.421us             6  
+                                            aten::copy_         5.72%      45.689us        60.26%     481.250us      80.208us      22.432us        34.95%      26.528us       4.421us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        18.69%      12.000us       4.000us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        16.25%      10.432us       3.477us             3  
+                                Activity Buffer Request        32.36%     258.416us        32.36%     258.416us     258.416us       4.096us         6.38%       4.096us       4.096us             1  
+                                    aten::empty_strided         4.01%      32.061us         4.01%      32.061us       5.344us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.87%     198.596us        24.87%     198.596us      22.066us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.93%      15.422us         2.51%      20.041us       2.227us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.02%       8.120us         1.02%       8.120us       0.541us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.610us         1.20%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.13%       9.059us         1.13%       9.059us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.190us         0.81%       6.450us       2.150us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 812.390us
-Self CUDA time total: 64.092us
+Self CPU time total: 798.567us
+Self CUDA time total: 64.192us
 
 
 
@@ -4607,29 +4607,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.628us       479.14%     333.628us     333.628us             1  
-                                            torch_eager        20.40%     175.467us        99.34%     854.292us     854.292us       0.000us         0.00%      73.758us      73.758us             1  
-                                           aten::conv1d         0.66%       5.659us        14.42%     124.043us      41.348us       0.000us         0.00%      47.231us      15.744us             3  
-                                      aten::convolution         1.14%       9.780us        13.77%     118.384us      39.461us       0.000us         0.00%      47.231us      15.744us             3  
-                                     aten::_convolution         2.67%      22.962us        12.63%     108.604us      36.201us       0.000us         0.00%      47.231us      15.744us             3  
-                                aten::_conv_depthwise2d         2.49%      21.432us         8.04%      69.183us      23.061us      47.231us        67.83%      47.231us      15.744us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.231us        67.83%      47.231us      15.744us             3  
-                                               aten::to         0.69%       5.899us        61.45%     528.412us      88.069us       0.000us         0.00%      26.527us       4.421us             6  
-                                         aten::_to_copy         2.66%      22.900us        60.76%     522.513us      87.086us       0.000us         0.00%      26.527us       4.421us             6  
-                                            aten::copy_         5.49%      47.171us        54.63%     469.832us      78.305us      22.399us        32.17%      26.527us       4.421us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.968us        17.19%      11.968us       3.989us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        14.98%      10.431us       3.477us             3  
-                                Activity Buffer Request        28.72%     246.977us        28.72%     246.977us     246.977us       4.128us         5.93%       4.128us       4.128us             1  
-                                    aten::empty_strided         3.46%      29.781us         3.46%      29.781us       4.964us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        22.90%     196.964us        22.90%     196.964us      21.885us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.98%      17.050us         2.57%      22.090us       2.454us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.01%       8.700us         1.01%       8.700us       0.580us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.95%      16.810us         1.95%      16.810us       5.603us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.12%       9.661us         1.12%       9.661us       3.220us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.74%       6.399us         0.92%       7.879us       2.626us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.646us       457.16%     319.646us     319.646us             1  
+                                            torch_eager        19.98%     173.384us        99.30%     861.709us     861.709us       0.000us         0.00%      74.048us      74.048us             1  
+                                           aten::conv1d         0.75%       6.498us        13.19%     114.441us      38.147us       0.000us         0.00%      47.392us      15.797us             3  
+                                      aten::convolution         1.03%       8.971us        12.44%     107.943us      35.981us       0.000us         0.00%      47.392us      15.797us             3  
+                                     aten::_convolution         2.49%      21.580us        11.41%      98.972us      32.991us       0.000us         0.00%      47.392us      15.797us             3  
+                                aten::_conv_depthwise2d         2.51%      21.761us         7.21%      62.552us      20.851us      47.392us        67.78%      47.392us      15.797us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.392us        67.78%      47.392us      15.797us             3  
+                                               aten::to         0.64%       5.540us        63.21%     548.512us      91.419us       0.000us         0.00%      26.656us       4.443us             6  
+                                         aten::_to_copy         2.49%      21.601us        62.57%     542.972us      90.495us       0.000us         0.00%      26.656us       4.443us             6  
+                                            aten::copy_         5.20%      45.112us        56.72%     492.221us      82.037us      22.528us        32.22%      26.656us       4.443us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        17.16%      12.000us       4.000us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        15.06%      10.528us       3.509us             3  
+                                Activity Buffer Request        31.22%     270.946us        31.22%     270.946us     270.946us       4.128us         5.90%       4.128us       4.128us             1  
+                                    aten::empty_strided         3.36%      29.150us         3.36%      29.150us       4.858us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        22.77%     197.623us        22.77%     197.623us      21.958us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.83%      15.841us         2.40%      20.801us       2.311us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.99%       8.559us         0.99%       8.559us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.14%       9.871us         1.14%       9.871us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.09%       9.460us         1.09%       9.460us       3.153us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.62%       5.360us         0.78%       6.760us       2.253us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 859.952us
-Self CUDA time total: 69.630us
+Self CPU time total: 867.779us
+Self CUDA time total: 69.920us
 
 
 
@@ -4639,29 +4639,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     346.806us       186.86%     346.806us     346.806us             1  
-                                            torch_eager        14.53%     117.325us        99.36%     802.090us     802.090us       0.000us         0.00%     195.516us     195.516us             1  
-                                           aten::conv1d         0.70%       5.679us        15.36%     123.982us      41.327us       0.000us         0.00%     133.212us      44.404us             3  
-                                      aten::convolution         1.23%       9.941us        14.65%     118.303us      39.434us       0.000us         0.00%     133.212us      44.404us             3  
-                                     aten::_convolution         2.76%      22.310us        13.42%     108.362us      36.121us       0.000us         0.00%     133.212us      44.404us             3  
-                                aten::_conv_depthwise2d         3.54%      28.551us         8.44%      68.171us      22.724us     133.212us        71.78%     133.212us      44.404us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.212us        71.78%     133.212us      44.404us             3  
-                                               aten::to         0.73%       5.930us        66.21%     534.532us      89.089us       0.000us         0.00%      62.304us      10.384us             6  
-                                         aten::_to_copy         2.86%      23.081us        65.48%     528.602us      88.100us       0.000us         0.00%      62.304us      10.384us             6  
-                                            aten::copy_         5.96%      48.102us        58.83%     474.941us      79.157us      52.384us        28.22%      62.304us      10.384us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.504us        15.90%      29.504us       9.835us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        12.33%      22.880us       7.627us             3  
-                                Activity Buffer Request        31.11%     251.176us        31.11%     251.176us     251.176us       9.920us         5.34%       9.920us       9.920us             1  
-                                    aten::empty_strided         3.79%      30.580us         3.79%      30.580us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.34%     196.463us        24.34%     196.463us      21.829us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.06%      16.611us         2.71%      21.892us       2.432us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.12%       9.041us         1.12%       9.041us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.13%       9.090us         1.13%       9.090us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.21%       9.730us         1.21%       9.730us       3.243us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.95%       7.689us         1.15%       9.269us       3.090us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.102us       177.34%     330.102us     330.102us             1  
+                                            torch_eager        14.69%     116.034us        99.32%     784.747us     784.747us       0.000us         0.00%     196.253us     196.253us             1  
+                                           aten::conv1d         0.71%       5.619us        14.22%     112.371us      37.457us       0.000us         0.00%     133.597us      44.532us             3  
+                                      aten::convolution         1.12%       8.850us        13.51%     106.752us      35.584us       0.000us         0.00%     133.597us      44.532us             3  
+                                     aten::_convolution         2.58%      20.371us        12.39%      97.902us      32.634us       0.000us         0.00%     133.597us      44.532us             3  
+                                aten::_conv_depthwise2d         2.74%      21.650us         7.96%      62.881us      20.960us     133.597us        71.77%     133.597us      44.532us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.597us        71.77%     133.597us      44.532us             3  
+                                               aten::to         0.71%       5.639us        67.25%     531.321us      88.554us       0.000us         0.00%      62.656us      10.443us             6  
+                                         aten::_to_copy         2.76%      21.830us        66.53%     525.682us      87.614us       0.000us         0.00%      62.656us      10.443us             6  
+                                            aten::copy_         5.82%      46.000us        59.91%     473.350us      78.892us      52.544us        28.23%      62.656us      10.443us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.568us        15.88%      29.568us       9.856us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.976us        12.34%      22.976us       7.659us             3  
+                                Activity Buffer Request        31.98%     252.705us        31.98%     252.705us     252.705us      10.112us         5.43%      10.112us      10.112us             1  
+                                    aten::empty_strided         3.86%      30.502us         3.86%      30.502us       5.084us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.88%     196.555us        24.88%     196.555us      21.839us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.01%      15.900us         2.64%      20.820us       2.313us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.601us         1.09%       8.601us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.21%       9.550us         1.21%       9.550us       3.183us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.24%       9.771us         1.24%       9.771us       3.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.130us         0.82%       6.480us       2.160us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 807.270us
-Self CUDA time total: 185.596us
+Self CPU time total: 790.087us
+Self CUDA time total: 186.141us
 
 
 
@@ -4671,29 +4671,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.579us       162.49%     339.579us     339.579us             1  
-                                            torch_eager        14.47%     115.372us        99.34%     792.230us     792.230us       0.000us         0.00%     222.202us     222.202us             1  
-                                           aten::conv1d         0.71%       5.650us        14.39%     114.783us      38.261us       0.000us         0.00%     153.629us      51.210us             3  
-                                      aten::convolution         1.12%       8.960us        13.68%     109.133us      36.378us       0.000us         0.00%     153.629us      51.210us             3  
-                                     aten::_convolution         2.81%      22.409us        12.56%     100.173us      33.391us       0.000us         0.00%     153.629us      51.210us             3  
-                                aten::_conv_depthwise2d         2.65%      21.140us         7.72%      61.571us      20.524us     153.629us        73.51%     153.629us      51.210us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.629us        73.51%     153.629us      51.210us             3  
-                                               aten::to         0.74%       5.880us        67.17%     535.703us      89.284us       0.000us         0.00%      68.573us      11.429us             6  
-                                         aten::_to_copy         2.96%      23.602us        66.44%     529.823us      88.304us       0.000us         0.00%      68.573us      11.429us             6  
-                                            aten::copy_         5.90%      47.080us        59.79%     476.780us      79.463us      55.357us        26.49%      68.573us      11.429us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.351us        15.48%      32.351us      10.784us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.006us        11.01%      23.006us       7.669us             3  
-                                Activity Buffer Request        31.17%     248.536us        31.17%     248.536us     248.536us      13.216us         6.32%      13.216us      13.216us             1  
-                                    aten::empty_strided         3.69%      29.441us         3.69%      29.441us       4.907us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.34%     202.095us        25.34%     202.095us      22.455us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.16%      17.223us         2.82%      22.464us       2.496us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.10%       8.783us         1.10%       8.783us       0.586us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.15%       9.160us         1.15%       9.160us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.30%      10.340us         1.30%      10.340us       3.447us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.78%       6.220us         0.94%       7.501us       2.500us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.514us       161.81%     339.514us     339.514us             1  
+                                            torch_eager        14.66%     116.206us        99.33%     787.167us     787.167us       0.000us         0.00%     223.098us     223.098us             1  
+                                           aten::conv1d         0.68%       5.420us        14.44%     114.442us      38.147us       0.000us         0.00%     153.917us      51.306us             3  
+                                      aten::convolution         1.14%       9.070us        13.76%     109.022us      36.341us       0.000us         0.00%     153.917us      51.306us             3  
+                                     aten::_convolution         2.75%      21.821us        12.61%      99.952us      33.317us       0.000us         0.00%     153.917us      51.306us             3  
+                                aten::_conv_depthwise2d         2.79%      22.080us         7.98%      63.231us      21.077us     153.917us        73.36%     153.917us      51.306us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.917us        73.36%     153.917us      51.306us             3  
+                                               aten::to         0.73%       5.771us        67.07%     531.500us      88.583us       0.000us         0.00%      69.181us      11.530us             6  
+                                         aten::_to_copy         2.74%      21.730us        66.34%     525.729us      87.622us       0.000us         0.00%      69.181us      11.530us             6  
+                                            aten::copy_         5.82%      46.089us        59.65%     472.729us      78.788us      55.902us        26.64%      69.181us      11.530us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.766us        15.62%      32.766us      10.922us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.136us        11.03%      23.136us       7.712us             3  
+                                Activity Buffer Request        31.36%     248.555us        31.36%     248.555us     248.555us      13.279us         6.33%      13.279us      13.279us             1  
+                                    aten::empty_strided         3.95%      31.270us         3.95%      31.270us       5.212us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.22%     199.867us        25.22%     199.867us      22.207us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.96%      15.509us         2.61%      20.650us       2.294us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.11%       8.780us         1.11%       8.780us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.26%       9.960us         1.26%       9.960us       3.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.19%       9.409us         1.19%       9.409us       3.136us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.67%       5.290us         0.83%       6.600us       2.200us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 797.480us
-Self CUDA time total: 208.986us
+Self CPU time total: 792.487us
+Self CUDA time total: 209.819us
 
 
 
@@ -4703,29 +4703,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.78%     123.653us        52.87%     964.884us     964.884us       0.000us         0.00%       1.509ms       1.509ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.412ms       100.39%       1.412ms       1.412ms             1  
-                                               aten::to         0.34%       6.170us        38.05%     694.387us     115.731us       0.000us         0.00%     816.698us     136.116us             6  
-                                         aten::_to_copy         1.57%      28.621us        37.71%     688.217us     114.703us       0.000us         0.00%     816.698us     136.116us             6  
-                                            aten::copy_         2.79%      50.981us        25.74%     469.702us      78.284us     713.755us        50.75%     816.698us     136.116us             6  
-                                           aten::conv1d         0.31%       5.748us         6.52%     119.002us      39.667us       0.000us         0.00%     692.571us     230.857us             3  
-                                      aten::convolution         0.58%      10.581us         6.21%     113.254us      37.751us       0.000us         0.00%     692.571us     230.857us             3  
-                                     aten::_convolution         1.23%      22.532us         5.63%     102.673us      34.224us       0.000us         0.00%     692.571us     230.857us             3  
-                                aten::_conv_depthwise2d         1.21%      22.101us         3.46%      63.221us      21.074us     692.571us        49.25%     692.571us     230.857us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     692.571us        49.25%     692.571us     230.857us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     406.878us        28.93%     406.878us     135.626us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.877us        21.82%     306.877us     102.292us             3  
-                                Activity Buffer Request        13.13%     239.566us        13.13%     239.566us     239.566us     102.943us         7.32%     102.943us     102.943us             1  
-                                    aten::empty_strided         2.01%      36.730us        10.40%     189.894us      31.649us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.02%     201.035us        11.02%     201.035us      22.337us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.99%      18.011us         1.28%      23.402us       2.600us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.50%       9.042us         0.50%       9.042us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.52%       9.410us         0.52%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.54%       9.830us         0.54%       9.830us       3.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.39%       7.200us         0.47%       8.510us       2.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.64%     122.391us        53.17%     980.021us     980.021us       0.000us         0.00%       1.512ms       1.512ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.413ms       100.43%       1.413ms       1.413ms             1  
+                                               aten::to         0.33%       6.111us        38.61%     711.666us     118.611us       0.000us         0.00%     818.734us     136.456us             6  
+                                         aten::_to_copy         1.46%      26.859us        38.28%     705.555us     117.592us       0.000us         0.00%     818.734us     136.456us             6  
+                                            aten::copy_         2.64%      48.673us        26.26%     483.993us      80.665us     713.617us        50.73%     818.734us     136.456us             6  
+                                           aten::conv1d         0.38%       6.930us         6.48%     119.403us      39.801us       0.000us         0.00%     692.947us     230.982us             3  
+                                      aten::convolution         0.60%      11.081us         6.10%     112.473us      37.491us       0.000us         0.00%     692.947us     230.982us             3  
+                                     aten::_convolution         1.18%      21.812us         5.50%     101.392us      33.797us       0.000us         0.00%     692.947us     230.982us             3  
+                                aten::_conv_depthwise2d         1.17%      21.580us         3.49%      64.271us      21.424us     692.947us        49.27%     692.947us     230.982us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     692.947us        49.27%     692.947us     230.982us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     406.808us        28.92%     406.808us     135.603us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.809us        21.81%     306.809us     102.270us             3  
+                                Activity Buffer Request        13.93%     256.826us        13.93%     256.826us     256.826us     105.117us         7.47%     105.117us     105.117us             1  
+                                    aten::empty_strided         2.09%      38.520us        10.56%     194.703us      32.450us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.93%     201.405us        10.93%     201.405us      22.378us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      16.870us         1.21%      22.379us       2.487us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%       9.219us         0.50%       9.219us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      10.080us         0.55%      10.080us       3.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.53%       9.700us         0.53%       9.700us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.350us         0.36%       6.580us       2.193us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.825ms
-Self CUDA time total: 1.406ms
+Self CPU time total: 1.843ms
+Self CUDA time total: 1.407ms
 
 
 
@@ -4735,29 +4735,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         3.67%     125.416us        69.04%       2.359ms       2.359ms       0.000us         0.00%       1.503ms       1.503ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.434ms       100.40%       1.434ms       1.434ms             1  
-                                               aten::to         0.17%       5.940us        61.15%       2.089ms     348.142us       0.000us         0.00%     760.702us     126.784us             6  
-                                         aten::_to_copy         0.72%      24.663us        60.97%       2.083ms     347.152us       0.000us         0.00%     760.702us     126.784us             6  
-                                            aten::copy_         1.35%      46.170us        59.36%       2.028ms     337.971us     686.110us        48.03%     760.702us     126.784us             6  
-                                           aten::conv1d         0.17%       5.960us         3.43%     117.002us      39.001us       0.000us         0.00%     742.490us     247.497us             3  
-                                      aten::convolution         0.28%       9.489us         3.25%     111.042us      37.014us       0.000us         0.00%     742.490us     247.497us             3  
-                                     aten::_convolution         0.69%      23.709us         2.97%     101.553us      33.851us       0.000us         0.00%     742.490us     247.497us             3  
-                                aten::_conv_depthwise2d         0.61%      20.921us         1.79%      61.223us      20.408us     742.490us        51.97%     742.490us     247.497us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     742.490us        51.97%     742.490us     247.497us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     400.959us        28.07%     400.959us     133.653us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     285.151us        19.96%     285.151us      95.050us             3  
-                                Activity Buffer Request        52.88%       1.806ms        52.88%       1.806ms       1.806ms      74.592us         5.22%      74.592us      74.592us             1  
-                                    aten::empty_strided         0.89%      30.420us         0.89%      30.420us       5.070us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         5.75%     196.514us         5.75%     196.514us      21.835us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.52%      17.690us         0.68%      23.179us       2.575us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.27%       9.290us         0.27%       9.290us       0.619us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.28%       9.521us         0.28%       9.521us       3.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.28%       9.690us         0.28%       9.690us       3.230us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.19%       6.410us         0.23%       7.991us       2.664us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         3.65%     121.382us        68.13%       2.266ms       2.266ms       0.000us         0.00%       1.497ms       1.497ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.428ms       100.41%       1.428ms       1.428ms             1  
+                                               aten::to         0.17%       5.810us        60.23%       2.004ms     333.937us       0.000us         0.00%     757.239us     126.207us             6  
+                                         aten::_to_copy         0.66%      21.893us        60.05%       1.998ms     332.969us       0.000us         0.00%     757.239us     126.207us             6  
+                                            aten::copy_         1.36%      45.129us        58.52%       1.947ms     324.453us     682.520us        47.99%     757.239us     126.207us             6  
+                                           aten::conv1d         0.16%       5.420us         3.47%     115.432us      38.477us       0.000us         0.00%     739.832us     246.611us             3  
+                                      aten::convolution         0.28%       9.180us         3.31%     110.012us      36.671us       0.000us         0.00%     739.832us     246.611us             3  
+                                     aten::_convolution         0.63%      21.009us         3.03%     100.832us      33.611us       0.000us         0.00%     739.832us     246.611us             3  
+                                aten::_conv_depthwise2d         0.66%      21.951us         1.95%      65.032us      21.677us     739.832us        52.01%     739.832us     246.611us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     739.832us        52.01%     739.832us     246.611us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.628us        27.96%     397.628us     132.543us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     284.892us        20.03%     284.892us      94.964us             3  
+                                Activity Buffer Request        51.86%       1.725ms        51.86%       1.725ms       1.725ms      74.719us         5.25%      74.719us      74.719us             1  
+                                    aten::empty_strided         0.88%      29.200us         0.88%      29.200us       4.867us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         6.01%     199.804us         6.01%     199.804us      22.200us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.50%      16.619us         0.66%      21.940us       2.438us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.27%       8.971us         0.27%       8.971us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.30%      10.051us         0.30%      10.051us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.29%       9.641us         0.29%       9.641us       3.214us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.16%       5.171us         0.19%       6.481us       2.160us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.416ms
-Self CUDA time total: 1.429ms
+Self CPU time total: 3.327ms
+Self CUDA time total: 1.422ms
 
 
 impl                     wl                  p50(ms)  ok
@@ -4775,7 +4775,7 @@ torch_eager              cuda_B2_D64_S512_W2     0.08  True
 torch_eager              cuda_B2_D64_S512_W4     0.08  True
 torch_eager              cuda_B4_D2048_S128_W2     0.08  True
 torch_eager              cuda_B4_D2048_S128_W4     0.08  True
-torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
+torch_eager              cuda_B4_D2048_S2048_W2     0.48  True
 torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
 torch_eager              cuda_B4_D2048_S512_W2     0.09  True
 torch_eager              cuda_B4_D2048_S512_W4     0.10  True
@@ -4789,7 +4789,53 @@ torch_eager              cuda_B4_D64_S512_W4     0.08  True
 
▶ UV Install Logs
diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg index 9d001b43fc09013b8b5b52409c7d90ceea48c347..725a6fb4dc222a3218bb822f1e70a35908a47885 100644 --- a/causal_conv1d/results/artifacts/combine/latency.svg +++ b/causal_conv1d/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2617076455d3985f32d3652d376c64caac8acd7513e105352d8ccd515d5c005 -size 35431 +oid sha256:2ee8e4503bfdd426f73797bc1dc8282f57f594087b5fe7c44c74d67c14a07ba6 +size 35420 diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html index 0e9a34ec4ca56343374b5033d35ba2ec30274059..0578da3d7c4fd4091fc0ac0acedb17bad523bb39 100644 --- a/causal_conv1d/results/combined_results.html +++ b/causal_conv1d/results/combined_results.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { - 2025-12-19T19:55:43.820965 + 2025-12-19T23:02:31.637981 image/svg+xml @@ -4233,70 +4233,70 @@ body[data-tool="eraser"] .main-content { - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -4304,66 +4304,66 @@ body[data-tool="eraser"] .main-content { - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + @@ -4422,7 +4422,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.64s +Cell: combine | 4.67s | Raw @@ -4547,7 +4547,7 @@ torch_eager cuda_B2_D64_S512_W2 0.08 True torch_eager cuda_B2_D64_S512_W4 0.08 True torch_eager cuda_B4_D2048_S128_W2 0.08 True torch_eager cuda_B4_D2048_S128_W4 0.08 True -torch_eager cuda_B4_D2048_S2048_W2 0.49 True +torch_eager cuda_B4_D2048_S2048_W2 0.48 True torch_eager cuda_B4_D2048_S2048_W4 0.50 True torch_eager cuda_B4_D2048_S512_W2 0.09 True torch_eager cuda_B4_D2048_S512_W4 0.10 True @@ -4576,7 +4576,7 @@ Implementations included:
▶ UV Install Logs
@@ -4589,7 +4589,7 @@ Installed 37 packages in 204ms - 2025-12-19T19:55:43.820965 + 2025-12-19T23:02:31.637981 image/svg+xml @@ -4933,70 +4933,70 @@ Installed 37 packages in 204ms - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -5004,66 +5004,66 @@ Installed 37 packages in 204ms - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl index 88e74fbc1e8be98d02656f976550fad6655fd361..b7c596207831092b886a1ffd17820448062fcfcb 100644 --- a/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +++ b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl @@ -1,4 +1,4 @@ -{"ts": "2025-12-19T19:41:40Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.4261789999732173, "p50": 3.457469000011315, "p90": 3.459429999963959, "mean": 3.4539671999937127, "iqr": 0.0022309999962999427, "raw_times": [3.459429999963959, 3.4695590000524135, 3.457469000011315, 3.457198999967659, 3.4261789999732173], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4921710000048733, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} -{"ts": "2025-12-19T19:41:40Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.229746999953932, "p50": 4.235096999991583, "p90": 4.236528000035378, "mean": 4.242877599995154, "iqr": 0.0027010000280824897, "raw_times": [4.236528000035378, 4.235096999991583, 4.279188999987582, 4.233827000007295, 4.229746999953932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.256637999958457, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} -{"ts": "2025-12-19T19:41:41Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.2112570000085725, "p50": 4.251798000041163, "p90": 4.262317999973675, "mean": 4.247635800004446, "iqr": 0.04195999997591571, "raw_times": [4.2112570000085725, 4.262317999973675, 4.220357999997759, 4.251798000041163, 4.292448000001059], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.28278900000123, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} -{"ts": "2025-12-19T19:41:41Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.324839999981123, "p50": 4.342328999996425, "p90": 4.349751000006563, "mean": 4.3508561999942685, "iqr": 0.022982000018600957, "raw_times": [4.326768999987962, 4.410591999999269, 4.342328999996425, 4.324839999981123, 4.349751000006563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.316158999984054, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} +{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03611099987210764, "p50": 0.037491000057343626, "p90": 0.038670999856549315, "mean": 0.03807699995377334, "iqr": 0.0014299998838396277, "raw_times": [0.04087100001015642, 0.038670999856549315, 0.03724099997270969, 0.037491000057343626, 0.03611099987210764], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04587100011121947, "peak_bytes": 2264064, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.539113701047427e-08, "mse": 6.418638644407112e-15, "ref": "deformable_detr_torch"}, "err": null} +{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0427610000315326, "p50": 0.04391099992062664, "p90": 0.04453099995771481, "mean": 0.043983000023217755, "iqr": 0.0007099997674231417, "raw_times": [0.0427610000315326, 0.04453099995771481, 0.044891000015923055, 0.04382100019029167, 0.04391099992062664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04723100005321612, "peak_bytes": 4004864, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.559346050176828e-08, "mse": 6.4289483059246175e-15, "ref": "deformable_detr_torch"}, "err": null} +{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04207100005260145, "p50": 0.04387099988889531, "p90": 0.044481000031737494, "mean": 0.04371499999251682, "iqr": 0.0019200001588615123, "raw_times": [0.04387099988889531, 0.044481000031737494, 0.04559100011647388, 0.04256099987287598, 0.04207100005260145], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04643099987333699, "peak_bytes": 5459968, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.555110149657594e-08, "mse": 6.418781369458724e-15, "ref": "deformable_detr_torch"}, "err": null} +{"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04534100003183994, "p50": 0.04615100010596507, "p90": 0.04615100010596507, "mean": 0.045852800030843355, "iqr": 0.0007410001217067474, "raw_times": [0.04615100010596507, 0.04540999998425832, 0.04615100010596507, 0.04534100003183994, 0.04621099992618838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04675100012718758, "peak_bytes": 8008704, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.5905669427147586e-08, "mse": 6.485184940875199e-15, "ref": "deformable_detr_torch"}, "err": null} diff --git a/deformable_detr/impls/cells/benchmark.py b/deformable_detr/impls/cells/benchmark.py index 66ccdf2085524240060089c8658a5256c484037b..a6ea7d22435520c378cc6db9da28875b23c10414 100644 --- a/deformable_detr/impls/cells/benchmark.py +++ b/deformable_detr/impls/cells/benchmark.py @@ -4,6 +4,7 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", +# "kernels", # ] # # [tool.uv.sources] @@ -12,107 +13,30 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel +# Load the deformable DETR kernel +deformable_detr = get_kernel("kernels-community/deformable-detr") -def torch_deformable_detr( + +def hf_kernels_deformable_detr( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64 ): - """ - PyTorch native reference implementation of multi-scale deformable attention. - Uses vectorized bilinear interpolation for reasonable performance. - """ - bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape - _, _, _, channels = value.shape - - output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype) - - # Split value tensor by levels - value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1) - - # Iterate through each level (can't avoid this loop easily) - for level_idx in range(num_levels): - h, w = spatial_shapes[level_idx].tolist() - value_level = value_list[level_idx] # (bs, h*w, num_heads, channels) - - # Reshape to spatial grid: (bs, num_heads, channels, h, w) - value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2) - - # Get sampling locations and weights for this level - # loc: (bs, num_queries, num_heads, num_points, 2) - loc = sampling_locations[:, :, :, level_idx, :, :] - # weight: (bs, num_queries, num_heads, num_points) - weight = attention_weights[:, :, :, level_idx, :] - - # Convert normalized coordinates to pixel coordinates - # loc[..., 0] is x (width), loc[..., 1] is y (height) - x = loc[..., 0] * w - 0.5 # (bs, num_queries, num_heads, num_points) - y = loc[..., 1] * h - 0.5 - - # Get integer coordinates for bilinear interpolation - x0 = torch.floor(x).long() - y0 = torch.floor(y).long() - x1 = x0 + 1 - y1 = y0 + 1 - - # Compute interpolation weights BEFORE clamping (important!) - lw = x - x0.float() # weight for x direction - lh = y - y0.float() # weight for y direction - hw = 1 - lw - hh = 1 - lh - - # Create mask for valid sample locations - valid = (y > -1) & (x > -1) & (y < h) & (x < w) - - # Create masks for each corner being in bounds - mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float() - mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float() - mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float() - mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float() - - # Clamp coordinates for safe indexing - x0_clamped = torch.clamp(x0, 0, w - 1) - x1_clamped = torch.clamp(x1, 0, w - 1) - y0_clamped = torch.clamp(y0, 0, h - 1) - y1_clamped = torch.clamp(y1, 0, h - 1) - - # Bilinear interpolation weights for all 4 corners - w_tl = (hh * hw).unsqueeze(-1) # top-left: (bs, num_queries, num_heads, num_points, 1) - w_tr = (hh * lw).unsqueeze(-1) # top-right - w_bl = (lh * hw).unsqueeze(-1) # bottom-left - w_br = (lh * lw).unsqueeze(-1) # bottom-right - - # Gather values from the 4 corners using advanced indexing - batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points) - head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points) - - # Gather corner values with clamped indices, then apply corner masks - v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl - v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr - v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl - v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br - - # Bilinear interpolation - sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br - - # Apply valid mask (only accumulate if entire sample location is valid) - sampled = sampled * valid.unsqueeze(-1).float() - - # Apply attention weights and sum over points - # weight: (bs, num_queries, num_heads, num_points) - # Expand weight: (bs, num_queries, num_heads, num_points, 1) - weighted_sampled = sampled * weight.unsqueeze(-1) - - # Sum over points: (bs, num_queries, num_heads, channels) - output += weighted_sampled.sum(dim=3) - - # Flatten last two dimensions to match kernel output - return output.reshape(bs, num_queries, num_heads * channels) + """HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention""" + return deformable_detr.ms_deform_attn_forward( + value=value, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + sampling_loc=sampling_locations, + attn_weight=attention_weights, + im2col_step=im2col_step + ) run_benchmark( kernel_type=KernelTypeEnum.DEFORMABLE_DETR, - impl_name="torch_eager", - impl_tags={"family": "pytorch", "backend": "eager"}, - impl_func=torch_deformable_detr, + impl_name="hf_kernels_deformable_detr", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_deformable_detr, dtype="float32", ) \ No newline at end of file diff --git a/deformable_detr/impls/hf_kernels_deformable_detr.html b/deformable_detr/impls/hf_kernels_deformable_detr.html index b8ada08c6f5a9425e532517dd7d73b5fd53f96a6..2bae2e97caa2e0247f01b6113be524eeb076d21c 100644 --- a/deformable_detr/impls/hf_kernels_deformable_detr.html +++ b/deformable_detr/impls/hf_kernels_deformable_detr.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.28s +Cell: nv | 0.25s | Raw @@ -3905,7 +3905,7 @@ Cell: nv | 0.28s
-
Fri Dec 19 19:41:27 2025       
+
Fri Dec 19 23:02:11 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            120W /  350W |       0MiB /  46068MiB |     92%      Default |
+| N/A   42C    P0             83W /  350W |       0MiB /  46068MiB |     12%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.28s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 8.66s
+Cell: benchmark | 4.69s
  | 
 
 Raw
@@ -4003,24 +4003,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     191.775us       760.50%     191.775us     191.775us             1  
-                             hf_kernels_deformable_detr         6.53%     139.932us        99.65%       2.134ms       2.134ms       0.000us         0.00%      26.274us      26.274us             1  
-       _deformable_detr_57c3d32::ms_deform_attn_forward         3.14%      67.151us        93.12%       1.994ms     664.639us      22.336us        88.58%      26.274us       8.758us             3  
-void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.336us        88.58%      22.336us       7.445us             3  
-                                            aten::zeros         0.92%      19.641us        87.16%       1.866ms     622.148us       0.000us         0.00%       3.938us       1.313us             3  
-                                            aten::zero_         0.66%      14.050us        84.59%       1.811ms     603.774us       0.000us         0.00%       3.938us       1.313us             3  
-                                            aten::fill_         1.47%      31.401us        83.93%       1.797ms     599.090us       2.881us        11.42%       3.938us       1.313us             3  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.881us        11.42%       2.881us       0.960us             3  
-                                Activity Buffer Request        80.47%       1.723ms        80.47%       1.723ms       1.723ms       1.057us         4.19%       1.057us       1.057us             1  
-                                            aten::empty         1.66%      35.481us         1.66%      35.481us      11.827us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.78%      59.511us         2.78%      59.511us       9.919us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.84%      17.931us         0.84%      17.931us       2.989us       0.000us         0.00%       0.000us       0.000us             6  
-                                           aten::select         1.00%      21.440us         1.20%      25.621us       8.540us       0.000us         0.00%       0.000us       0.000us             3  
-                                       aten::as_strided         0.20%       4.181us         0.20%       4.181us       1.394us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.35%       7.461us         0.35%       7.461us       7.461us       0.000us         0.00%       0.000us       0.000us             1  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     179.167us       708.76%     179.167us     179.167us             1  
+                             hf_kernels_deformable_detr         6.05%     126.291us        99.56%       2.078ms       2.078ms       0.000us         0.00%      26.335us      26.335us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         2.99%      62.312us        93.50%       1.951ms     650.448us      22.366us        88.48%      26.335us       8.778us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.366us        88.48%      22.366us       7.455us             3  
+                                            aten::zeros         0.88%      18.443us        87.81%       1.832ms     610.824us       0.000us         0.00%       3.969us       1.323us             3  
+                                            aten::zero_         0.60%      12.470us        85.41%       1.782ms     594.116us       0.000us         0.00%       3.969us       1.323us             3  
+                                            aten::fill_         1.40%      29.180us        84.81%       1.770ms     589.959us       2.913us        11.52%       3.969us       1.323us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.913us        11.52%       2.913us       0.971us             3  
+                                Activity Buffer Request        81.41%       1.699ms        81.41%       1.699ms       1.699ms       1.056us         4.18%       1.056us       1.056us             1  
+                                            aten::empty         1.52%      31.680us         1.52%      31.680us      10.560us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.90%      60.481us         2.90%      60.481us      10.080us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.77%      16.170us         0.77%      16.170us       2.695us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.87%      18.140us         1.04%      21.670us       7.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.17%       3.530us         0.17%       3.530us       1.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.44%       9.280us         0.44%       9.280us       9.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.141ms
-Self CUDA time total: 25.217us
+Self CPU time total: 2.087ms
+Self CUDA time total: 25.279us
 
 
 
@@ -4030,24 +4030,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     140.638us       537.96%     140.638us     140.638us             1  
-                             hf_kernels_deformable_detr         3.75%      74.302us        99.73%       1.975ms       1.975ms       0.000us         0.00%      27.071us      27.071us             1  
-       _deformable_detr_57c3d32::ms_deform_attn_forward         1.66%      32.812us        95.98%       1.901ms     633.661us      23.327us        89.23%      27.071us       9.024us             3  
-void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      23.327us        89.23%      23.327us       7.776us             3  
-                                            aten::zeros         0.45%       8.890us        92.43%       1.831ms     610.224us       0.000us         0.00%       3.744us       1.248us             3  
-                                            aten::zero_         0.40%       7.970us        91.07%       1.804ms     601.294us       0.000us         0.00%       3.744us       1.248us             3  
-                                            aten::fill_         1.26%      24.969us        90.67%       1.796ms     598.637us       2.816us        10.77%       3.744us       1.248us             3  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.816us        10.77%       2.816us       0.939us             3  
-                                Activity Buffer Request        88.11%       1.745ms        88.11%       1.745ms       1.745ms       0.928us         3.55%       0.928us       0.928us             1  
-                                            aten::empty         0.90%      17.900us         0.90%      17.900us       5.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.05%      40.542us         2.05%      40.542us       6.757us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.46%       9.070us         0.46%       9.070us       1.512us       0.000us         0.00%       0.000us       0.000us             6  
-                                           aten::select         0.58%      11.410us         0.69%      13.720us       4.573us       0.000us         0.00%       0.000us       0.000us             3  
-                                       aten::as_strided         0.12%       2.310us         0.12%       2.310us       0.770us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.400us         0.27%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     134.144us       513.10%     134.144us     134.144us             1  
+                             hf_kernels_deformable_detr         4.75%      94.541us        99.73%       1.985ms       1.985ms       0.000us         0.00%      27.072us      27.072us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         1.59%      31.632us        94.98%       1.890ms     630.031us      23.360us        89.35%      27.072us       9.024us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      23.360us        89.35%      23.360us       7.787us             3  
+                                            aten::zeros         0.38%       7.548us        91.51%       1.821ms     607.010us       0.000us         0.00%       3.712us       1.237us             3  
+                                            aten::zero_         0.42%       8.279us        90.32%       1.797ms     599.120us       0.000us         0.00%       3.712us       1.237us             3  
+                                            aten::fill_         1.23%      24.533us        89.90%       1.789ms     596.360us       2.784us        10.65%       3.712us       1.237us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.784us        10.65%       2.784us       0.928us             3  
+                                Activity Buffer Request        87.33%       1.738ms        87.33%       1.738ms       1.738ms       0.928us         3.55%       0.928us       0.928us             1  
+                                            aten::empty         0.81%      16.122us         0.81%      16.122us       5.374us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.12%      42.110us         2.12%      42.110us       7.018us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.47%       9.440us         0.47%       9.440us       1.573us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.52%      10.420us         0.63%      12.530us       4.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.11%       2.110us         0.11%       2.110us       0.703us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.390us         0.27%       5.390us       5.390us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.981ms
-Self CUDA time total: 26.143us
+Self CPU time total: 1.990ms
+Self CUDA time total: 26.144us
 
 
 
@@ -4057,24 +4057,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     139.424us       546.70%     139.424us     139.424us             1  
-                             hf_kernels_deformable_detr         3.45%      67.322us        99.73%       1.947ms       1.947ms       0.000us         0.00%      26.463us      26.463us             1  
-       _deformable_detr_57c3d32::ms_deform_attn_forward         1.76%      34.371us        96.28%       1.880ms     626.621us      22.688us        88.96%      26.463us       8.821us             3  
-void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.688us        88.96%      22.688us       7.563us             3  
-                                            aten::zeros         0.42%       8.159us        92.58%       1.808ms     602.514us       0.000us         0.00%       3.775us       1.258us             3  
-                                            aten::zero_         0.40%       7.880us        91.30%       1.783ms     594.177us       0.000us         0.00%       3.775us       1.258us             3  
-                                            aten::fill_         1.36%      26.500us        90.89%       1.775ms     591.551us       2.815us        11.04%       3.775us       1.258us             3  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.815us        11.04%       2.815us       0.938us             3  
-                                Activity Buffer Request        88.22%       1.722ms        88.22%       1.722ms       1.722ms       0.960us         3.76%       0.960us       0.960us             1  
-                                            aten::empty         0.86%      16.851us         0.86%      16.851us       5.617us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.08%      40.632us         2.08%      40.632us       6.772us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.52%      10.080us         0.52%      10.080us       1.680us       0.000us         0.00%       0.000us       0.000us             6  
-                                           aten::select         0.55%      10.661us         0.66%      12.960us       4.320us       0.000us         0.00%       0.000us       0.000us             3  
-                                       aten::as_strided         0.12%       2.299us         0.12%       2.299us       0.766us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       5.270us         0.27%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     132.927us       521.86%     132.927us     132.927us             1  
+                             hf_kernels_deformable_detr         4.64%      88.002us        99.69%       1.889ms       1.889ms       0.000us         0.00%      26.432us      26.432us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         1.65%      31.271us        95.05%       1.801ms     600.270us      22.624us        88.82%      26.432us       8.811us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.624us        88.82%      22.624us       7.541us             3  
+                                            aten::zeros         0.45%       8.600us        91.43%       1.732ms     577.433us       0.000us         0.00%       3.808us       1.269us             3  
+                                            aten::zero_         0.42%       7.879us        90.13%       1.708ms     569.182us       0.000us         0.00%       3.808us       1.269us             3  
+                                            aten::fill_         1.34%      25.390us        89.71%       1.700ms     566.556us       2.848us        11.18%       3.808us       1.269us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.848us        11.18%       2.848us       0.949us             3  
+                                Activity Buffer Request        87.00%       1.648ms        87.00%       1.648ms       1.648ms       0.960us         3.77%       0.960us       0.960us             1  
+                                            aten::empty         0.85%      16.152us         0.85%      16.152us       5.384us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.16%      40.982us         2.16%      40.982us       6.830us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.49%       9.259us         0.49%       9.259us       1.543us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.57%      10.851us         0.68%      12.901us       4.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.11%       2.050us         0.11%       2.050us       0.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.790us         0.31%       5.790us       5.790us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.952ms
-Self CUDA time total: 25.503us
+Self CPU time total: 1.895ms
+Self CUDA time total: 25.472us
 
 
 
@@ -4084,42 +4084,42 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     144.383us       310.53%     144.383us     144.383us             1  
-                             hf_kernels_deformable_detr         3.20%      70.383us        99.77%       2.197ms       2.197ms       0.000us         0.00%      47.520us      47.520us             1  
-       _deformable_detr_57c3d32::ms_deform_attn_forward         1.51%      33.251us        96.57%       2.127ms     709.009us      43.392us        93.32%      47.520us      15.840us             3  
-void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      43.392us        93.32%      43.392us      14.464us             3  
-                                            aten::zeros         0.36%       7.853us        93.39%       2.057ms     685.609us       0.000us         0.00%       4.128us       1.376us             3  
-                                            aten::zero_         0.36%       8.030us        92.24%       2.032ms     677.202us       0.000us         0.00%       4.128us       1.376us             3  
-                                            aten::fill_         1.13%      24.791us        91.88%       2.024ms     674.525us       3.104us         6.68%       4.128us       1.376us             3  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.104us         6.68%       3.104us       1.035us             3  
-                                Activity Buffer Request        79.51%       1.751ms        79.51%       1.751ms       1.751ms       1.024us         2.20%       1.024us       1.024us             1  
-                                            aten::empty         0.79%      17.369us         0.79%      17.369us       5.790us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        11.88%     261.685us        11.88%     261.685us      43.614us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.43%       9.529us         0.43%       9.529us       1.588us       0.000us         0.00%       0.000us       0.000us             6  
-                                           aten::select         0.50%      10.960us         0.60%      13.220us       4.407us       0.000us         0.00%       0.000us       0.000us             3  
-                                       aten::as_strided         0.10%       2.260us         0.10%       2.260us       0.753us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.23%       5.101us         0.23%       5.101us       5.101us       0.000us         0.00%       0.000us       0.000us             1  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     141.952us       303.01%     141.952us     141.952us             1  
+                             hf_kernels_deformable_detr         4.29%      94.562us        99.77%       2.200ms       2.200ms       0.000us         0.00%      47.871us      47.871us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         1.45%      32.013us        95.49%       2.106ms     701.872us      43.744us        93.38%      47.871us      15.957us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      43.744us        93.38%      43.744us      14.581us             3  
+                                            aten::zeros         0.35%       7.690us        92.40%       2.038ms     679.194us       0.000us         0.00%       4.127us       1.376us             3  
+                                            aten::zero_         0.37%       8.230us        91.34%       2.014ms     671.361us       0.000us         0.00%       4.127us       1.376us             3  
+                                            aten::fill_         1.11%      24.520us        90.96%       2.006ms     668.618us       3.103us         6.62%       4.127us       1.376us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.103us         6.62%       3.103us       1.034us             3  
+                                Activity Buffer Request        79.72%       1.758ms        79.72%       1.758ms       1.758ms       1.024us         2.19%       1.024us       1.024us             1  
+                                            aten::empty         0.72%      15.810us         0.72%      15.810us       5.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.76%     237.325us        10.76%     237.325us      39.554us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.42%       9.159us         0.42%       9.159us       1.527us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.49%      10.790us         0.58%      12.870us       4.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.09%       2.080us         0.09%       2.080us       0.693us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.23%       4.980us         0.23%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.203ms
-Self CUDA time total: 46.496us
+Self CPU time total: 2.205ms
+Self CUDA time total: 46.847us
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4     0.04  True
-hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4     0.05  True
-hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4     0.04  True
+hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4     0.04  True
 hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4     0.05  True
 
▶ UV Install Logs
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] -Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:00, 8.32it/s] -Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 7.49it/s] -Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 10.54it/s]
+Fetching 7 files: 29%|██▊ | 2/7 [00:00<00:00, 16.18it/s] +Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 9.41it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 13.86it/s]

Artifacts:

deformable_detr.jsonl diff --git a/deformable_detr/impls/torch_deformable_detr.html b/deformable_detr/impls/torch_deformable_detr.html index a77e11471dfe058dd73f6c2becd778fac483cba7..32fe8c011a656fa62f1c1a5ce5fe97da5fc424c1 100644 --- a/deformable_detr/impls/torch_deformable_detr.html +++ b/deformable_detr/impls/torch_deformable_detr.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.28s +Cell: nv | 0.25s | Raw @@ -3904,7 +3904,7 @@ Cell: nv | 0.28s
-
Fri Dec 19 19:41:27 2025       
+
Fri Dec 19 23:02:11 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            120W /  350W |       0MiB /  46068MiB |     92%      Default |
+| N/A   42C    P0             83W /  350W |       0MiB /  46068MiB |     12%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.28s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 5.49s
+Cell: benchmark | 9.26s
  | 
 
 Raw
@@ -4077,29 +4077,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      20.954ms      1412.31%      20.954ms      20.954ms             1  
-                                            torch_eager        20.81%       4.774ms        99.96%      22.930ms      22.930ms       0.000us         0.00%       1.485ms       1.485ms             1  
-                                            aten::index         4.58%       1.051ms        16.46%       3.775ms      78.637us     236.928us        15.97%     370.530us       7.719us            48  
-                                            aten::copy_         4.73%       1.085ms        11.22%       2.575ms      11.756us     365.953us        24.67%     365.953us       1.671us           219  
-                                              aten::mul         5.80%       1.330ms        10.04%       2.304ms      12.001us     294.214us        19.83%     294.214us       1.532us           192  
-void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     236.928us        15.97%     236.928us       4.936us            48  
-                                               aten::to         0.58%     133.877us        11.21%       2.571ms      15.036us       0.000us         0.00%     232.351us       1.359us           171  
-                                         aten::_to_copy         2.31%     530.135us        10.63%       2.437ms      19.815us       0.000us         0.00%     232.351us       1.889us           123  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     202.211us        13.63%     202.211us       1.685us           120  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     167.998us        11.32%     167.998us       2.000us            84  
-                                       aten::contiguous         0.35%      80.702us         8.37%       1.919ms      19.991us       0.000us         0.00%     133.602us       1.392us            96  
-                                            aten::clone         0.72%     165.584us         8.01%       1.838ms      19.151us       0.000us         0.00%     133.602us       1.392us            96  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.602us         9.00%     133.602us       1.392us            96  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.712us         7.80%     115.712us       1.205us            96  
-                                          aten::__and__         0.62%     142.312us         4.46%       1.024ms      12.189us       0.000us         0.00%      99.106us       1.180us            84  
-                                      aten::bitwise_and         2.26%     518.769us         3.84%     881.597us      10.495us      99.106us         6.68%      99.106us       1.180us            84  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      99.106us         6.68%      99.106us       1.180us            84  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      86.240us         5.81%      86.240us       1.198us            72  
-                                              aten::sub         2.18%     500.017us         3.71%     850.631us      11.814us      79.203us         5.34%      79.203us       1.100us            72  
-                                              aten::add         1.61%     368.526us         2.74%     627.393us      10.457us      74.431us         5.02%      74.431us       1.241us            60  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.976ms      1348.57%      19.976ms      19.976ms             1  
+                                            torch_eager        20.04%       4.395ms        99.96%      21.929ms      21.929ms       0.000us         0.00%       1.482ms       1.482ms             1  
+                                            aten::index         4.53%     992.766us        16.58%       3.638ms      75.786us     236.544us        15.97%     370.336us       7.715us            48  
+                                            aten::copy_         4.69%       1.028ms        11.56%       2.535ms      11.576us     366.053us        24.71%     366.053us       1.671us           219  
+                                              aten::mul         5.90%       1.295ms        10.04%       2.203ms      11.474us     293.531us        19.82%     293.531us       1.529us           192  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     236.544us        15.97%     236.544us       4.928us            48  
+                                               aten::to         0.58%     126.843us        11.27%       2.473ms      14.461us       0.000us         0.00%     232.261us       1.358us           171  
+                                         aten::_to_copy         1.95%     426.950us        10.69%       2.346ms      19.073us       0.000us         0.00%     232.261us       1.888us           123  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     201.821us        13.62%     201.821us       1.682us           120  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     167.778us        11.33%     167.778us       1.997us            84  
+                                       aten::contiguous         0.36%      78.966us         8.52%       1.869ms      19.471us       0.000us         0.00%     133.792us       1.394us            96  
+                                            aten::clone         0.74%     161.750us         8.16%       1.790ms      18.648us       0.000us         0.00%     133.792us       1.394us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.792us         9.03%     133.792us       1.394us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.553us         7.80%     115.553us       1.204us            96  
+                                          aten::__and__         0.42%      91.609us         4.49%     984.808us      11.724us       0.000us         0.00%      99.041us       1.179us            84  
+                                      aten::bitwise_and         2.54%     557.575us         4.07%     893.199us      10.633us      99.041us         6.69%      99.041us       1.179us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      99.041us         6.69%      99.041us       1.179us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      86.140us         5.82%      86.140us       1.196us            72  
+                                              aten::sub         2.17%     475.165us         3.61%     791.992us      11.000us      79.197us         5.35%      79.197us       1.100us            72  
+                                              aten::add         1.62%     354.490us         2.70%     592.103us       9.868us      74.334us         5.02%      74.334us       1.239us            60  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 22.938ms
-Self CUDA time total: 1.484ms
+Self CPU time total: 21.937ms
+Self CUDA time total: 1.481ms
 
 
 
@@ -4109,29 +4109,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.509ms      1221.49%      19.509ms      19.509ms             1  
-                                            torch_eager        19.85%       4.302ms        99.97%      21.668ms      21.668ms       0.000us         0.00%       1.598ms       1.598ms             1  
-                                            aten::index         4.46%     966.583us        16.34%       3.542ms      73.793us     250.148us        15.66%     382.462us       7.968us            48  
-                                            aten::copy_         4.88%       1.058ms        11.66%       2.528ms      11.545us     367.423us        23.01%     367.423us       1.678us           219  
-                                              aten::mul         5.89%       1.276ms        10.32%       2.236ms      11.647us     359.260us        22.49%     359.260us       1.871us           192  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     267.420us        16.74%     267.420us       2.228us           120  
-void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     250.148us        15.66%     250.148us       5.211us            48  
-                                               aten::to         0.54%     118.126us        10.89%       2.361ms      13.808us       0.000us         0.00%     235.109us       1.375us           171  
-                                         aten::_to_copy         1.87%     405.252us        10.35%       2.243ms      18.236us       0.000us         0.00%     235.109us       1.911us           123  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     169.767us        10.63%     169.767us       2.021us            84  
-                                       aten::contiguous         0.36%      77.869us         8.56%       1.855ms      19.322us       0.000us         0.00%     132.314us       1.378us            96  
-                                            aten::clone         0.77%     166.617us         8.20%       1.777ms      18.511us       0.000us         0.00%     132.314us       1.378us            96  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.314us         8.28%     132.314us       1.378us            96  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     117.886us         7.38%     117.886us       1.228us            96  
-                                          aten::__and__         0.36%      78.606us         4.33%     937.927us      11.166us       0.000us         0.00%     105.249us       1.253us            84  
-                                      aten::bitwise_and         2.36%     512.411us         3.96%     859.321us      10.230us     105.249us         6.59%     105.249us       1.253us            84  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     105.249us         6.59%     105.249us       1.253us            84  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.480us         6.54%     104.480us       1.451us            72  
-                                              aten::add         1.62%     350.142us         2.81%     608.190us      10.136us      91.837us         5.75%      91.837us       1.531us            60  
-                                              aten::sub         2.30%     498.767us         3.88%     840.992us      11.680us      80.480us         5.04%      80.480us       1.118us            72  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.069ms      1196.67%      19.069ms      19.069ms             1  
+                                            torch_eager        19.87%       4.152ms        99.97%      20.886ms      20.886ms       0.000us         0.00%       1.594ms       1.594ms             1  
+                                            aten::index         4.48%     935.232us        16.67%       3.483ms      72.569us     249.668us        15.67%     382.147us       7.961us            48  
+                                            aten::copy_         4.80%       1.003ms        11.85%       2.477ms      11.308us     366.556us        23.00%     366.556us       1.674us           219  
+                                              aten::mul         6.04%       1.262ms        10.39%       2.170ms      11.304us     358.714us        22.51%     358.714us       1.868us           192  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     267.167us        16.77%     267.167us       2.226us           120  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     249.668us        15.67%     249.668us       5.201us            48  
+                                               aten::to         0.60%     125.408us        11.23%       2.347ms      13.724us       0.000us         0.00%     234.077us       1.369us           171  
+                                         aten::_to_copy         1.87%     389.897us        10.63%       2.221ms      18.060us       0.000us         0.00%     234.077us       1.903us           123  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     169.728us        10.65%     169.728us       2.021us            84  
+                                       aten::contiguous         0.35%      74.120us         8.81%       1.840ms      19.167us       0.000us         0.00%     132.479us       1.380us            96  
+                                            aten::clone         0.79%     164.425us         8.45%       1.766ms      18.395us       0.000us         0.00%     132.479us       1.380us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.479us         8.31%     132.479us       1.380us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     117.475us         7.37%     117.475us       1.224us            96  
+                                          aten::__and__         0.44%      90.959us         4.50%     941.006us      11.202us       0.000us         0.00%     105.476us       1.256us            84  
+                                      aten::bitwise_and         2.49%     520.216us         4.07%     850.047us      10.120us     105.476us         6.62%     105.476us       1.256us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     105.476us         6.62%     105.476us       1.256us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.197us         6.54%     104.197us       1.447us            72  
+                                              aten::add         1.62%     338.151us         2.73%     570.998us       9.517us      91.678us         5.75%      91.678us       1.528us            60  
+                                              aten::sub         2.14%     447.777us         3.61%     754.447us      10.478us      80.286us         5.04%      80.286us       1.115us            72  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 21.675ms
-Self CUDA time total: 1.597ms
+Self CPU time total: 20.891ms
+Self CUDA time total: 1.593ms
 
 
 
@@ -4141,29 +4141,29 @@ PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.494ms      1266.42%      19.494ms      19.494ms             1  
-                                            torch_eager        20.07%       4.284ms        99.97%      21.345ms      21.345ms       0.000us         0.00%       1.540ms       1.540ms             1  
-                                            aten::index         4.57%     976.579us        16.61%       3.546ms      73.876us     243.229us        15.80%     377.664us       7.868us            48  
-                                            aten::copy_         4.96%       1.060ms        11.92%       2.545ms      11.623us     367.712us        23.89%     367.712us       1.679us           219  
-                                              aten::mul         6.15%       1.313ms        10.67%       2.278ms      11.865us     325.252us        21.13%     325.252us       1.694us           192  
-void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     243.229us        15.80%     243.229us       5.067us            48  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     234.241us        15.22%     234.241us       1.952us           120  
-                                               aten::to         0.55%     117.567us        11.05%       2.359ms      13.796us       0.000us         0.00%     233.277us       1.364us           171  
-                                         aten::_to_copy         1.93%     412.957us        10.50%       2.242ms      18.225us       0.000us         0.00%     233.277us       1.897us           123  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     168.702us        10.96%     168.702us       2.008us            84  
-                                       aten::contiguous         0.37%      78.560us         8.76%       1.871ms      19.493us       0.000us         0.00%     134.435us       1.400us            96  
-                                            aten::clone         0.72%     153.204us         8.40%       1.793ms      18.675us       0.000us         0.00%     134.435us       1.400us            96  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.435us         8.73%     134.435us       1.400us            96  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.962us         7.53%     115.962us       1.208us            96  
-                                          aten::__and__         0.35%      74.950us         4.35%     927.999us      11.048us       0.000us         0.00%     104.006us       1.238us            84  
-                                      aten::bitwise_and         2.36%     503.597us         4.00%     853.049us      10.155us     104.006us         6.76%     104.006us       1.238us            84  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.006us         6.76%     104.006us       1.238us            84  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      95.808us         6.22%      95.808us       1.331us            72  
-                                              aten::add         1.68%     357.766us         2.90%     618.339us      10.306us      83.778us         5.44%      83.778us       1.396us            60  
-                                              aten::sub         2.21%     472.075us         3.83%     818.182us      11.364us      78.946us         5.13%      78.946us       1.096us            72  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.677ms      1279.16%      19.677ms      19.677ms             1  
+                                            torch_eager        19.82%       4.280ms        99.97%      21.590ms      21.590ms       0.000us         0.00%       1.539ms       1.539ms             1  
+                                            aten::index         4.49%     970.701us        16.56%       3.576ms      74.506us     243.261us        15.81%     377.688us       7.868us            48  
+                                            aten::copy_         4.67%       1.008ms        11.52%       2.487ms      11.356us     367.898us        23.92%     367.898us       1.680us           219  
+                                              aten::mul         5.96%       1.287ms        10.22%       2.207ms      11.495us     324.384us        21.09%     324.384us       1.690us           192  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     243.261us        15.81%     243.261us       5.068us            48  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.533us        15.18%     233.533us       1.946us           120  
+                                               aten::to         0.57%     122.968us        11.17%       2.413ms      14.109us       0.000us         0.00%     233.471us       1.365us           171  
+                                         aten::_to_copy         1.93%     415.801us        10.60%       2.290ms      18.615us       0.000us         0.00%     233.471us       1.898us           123  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     169.053us        10.99%     169.053us       2.013us            84  
+                                       aten::contiguous         0.37%      80.833us         8.61%       1.859ms      19.360us       0.000us         0.00%     134.427us       1.400us            96  
+                                            aten::clone         0.74%     159.128us         8.23%       1.778ms      18.518us       0.000us         0.00%     134.427us       1.400us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.427us         8.74%     134.427us       1.400us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.871us         7.53%     115.871us       1.207us            96  
+                                          aten::__and__         0.43%      92.507us         4.50%     971.781us      11.569us       0.000us         0.00%     104.160us       1.240us            84  
+                                      aten::bitwise_and         2.49%     538.828us         4.07%     879.274us      10.468us     104.160us         6.77%     104.160us       1.240us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.160us         6.77%     104.160us       1.240us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      95.908us         6.23%      95.908us       1.332us            72  
+                                              aten::add         1.64%     354.089us         2.75%     594.321us       9.905us      83.684us         5.44%      83.684us       1.395us            60  
+                                              aten::sub         2.17%     468.302us         3.66%     789.975us      10.972us      79.297us         5.15%      79.297us       1.101us            72  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 21.351ms
-Self CUDA time total: 1.539ms
+Self CPU time total: 21.596ms
+Self CUDA time total: 1.538ms
 
 
 
@@ -4173,37 +4173,43 @@ PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      21.106ms      1191.42%      21.106ms      21.106ms             1  
-                                            torch_eager        20.48%       4.473ms        99.97%      21.833ms      21.833ms       0.000us         0.00%       1.773ms       1.773ms             1  
-                                              aten::mul         6.38%       1.394ms        11.03%       2.409ms      12.546us     451.910us        25.51%     451.910us       2.354us           192  
-                                            aten::index         4.81%       1.050ms        17.73%       3.872ms      80.660us     281.474us        15.89%     419.235us       8.734us            48  
-                                            aten::copy_         5.13%       1.119ms        12.00%       2.622ms      11.970us     371.967us        21.00%     371.967us       1.698us           219  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     357.220us        20.17%     357.220us       2.977us           120  
-void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     281.474us        15.89%     281.474us       5.864us            48  
-                                               aten::to         0.62%     134.727us        11.66%       2.546ms      14.889us       0.000us         0.00%     234.206us       1.370us           171  
-                                         aten::_to_copy         2.10%     458.958us        11.04%       2.411ms      19.605us       0.000us         0.00%     234.206us       1.904us           123  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     168.509us         9.51%     168.509us       2.006us            84  
-                                       aten::contiguous         0.48%     104.345us         9.14%       1.996ms      20.797us       0.000us         0.00%     137.761us       1.435us            96  
-                                            aten::clone         0.85%     185.548us         8.66%       1.892ms      19.710us       0.000us         0.00%     137.761us       1.435us            96  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     137.761us         7.78%     137.761us       1.435us            96  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     129.798us         7.33%     129.798us       1.803us            72  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     120.063us         6.78%     120.063us       1.251us            96  
-                                              aten::add         1.83%     400.048us         3.06%     668.907us      11.148us     114.148us         6.44%     114.148us       1.902us            60  
-                                          aten::__and__         0.43%      94.485us         4.77%       1.041ms      12.390us       0.000us         0.00%     108.862us       1.296us            84  
-                                      aten::bitwise_and         2.65%     579.339us         4.33%     946.258us      11.265us     108.862us         6.15%     108.862us       1.296us            84  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     108.862us         6.15%     108.862us       1.296us            84  
-                                              aten::sub         2.45%     535.892us         4.10%     895.598us      12.439us      84.572us         4.77%      84.572us       1.175us            72  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.412ms      1097.11%      19.412ms      19.412ms             1  
+                                            torch_eager        19.43%       4.188ms        99.97%      21.544ms      21.544ms       0.000us         0.00%       1.770ms       1.770ms             1  
+                                              aten::mul         5.88%       1.267ms        10.26%       2.212ms      11.521us     450.496us        25.46%     450.496us       2.346us           192  
+                                            aten::index         4.35%     938.379us        16.41%       3.536ms      73.661us     281.281us        15.90%     418.917us       8.727us            48  
+                                            aten::copy_         4.72%       1.017ms        12.00%       2.587ms      11.811us     371.333us        20.99%     371.333us       1.696us           219  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     355.809us        20.11%     355.809us       2.965us           120  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     281.281us        15.90%     281.281us       5.860us            48  
+                                               aten::to         0.57%     122.376us        11.15%       2.403ms      14.050us       0.000us         0.00%     233.697us       1.367us           171  
+                                         aten::_to_copy         1.79%     386.738us        10.58%       2.280ms      18.538us       0.000us         0.00%     233.697us       1.900us           123  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     167.937us         9.49%     167.937us       1.999us            84  
+                                       aten::contiguous         0.36%      77.297us         8.74%       1.884ms      19.624us       0.000us         0.00%     137.636us       1.434us            96  
+                                            aten::clone         0.72%     155.217us         8.38%       1.807ms      18.819us       0.000us         0.00%     137.636us       1.434us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     137.636us         7.78%     137.636us       1.434us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     130.211us         7.36%     130.211us       1.808us            72  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     119.940us         6.78%     119.940us       1.249us            96  
+                                              aten::add         1.56%     336.953us         2.72%     585.265us       9.754us     114.431us         6.47%     114.431us       1.907us            60  
+                                          aten::__and__         0.41%      88.309us         4.45%     959.250us      11.420us       0.000us         0.00%     108.994us       1.298us            84  
+                                      aten::bitwise_and         2.40%     517.417us         4.04%     870.941us      10.368us     108.994us         6.16%     108.994us       1.298us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     108.994us         6.16%     108.994us       1.298us            84  
+                                              aten::sub         2.15%     464.219us         3.68%     792.358us      11.005us      84.546us         4.78%      84.546us       1.174us            72  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 21.838ms
-Self CUDA time total: 1.771ms
+Self CPU time total: 21.550ms
+Self CUDA time total: 1.769ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_Q100_H8_E256_L4_P4     3.46  True
-torch_eager              cuda_B1_Q300_H8_E256_L4_P4     4.24  True
-torch_eager              cuda_B2_Q100_H8_E256_L4_P4     4.25  True
-torch_eager              cuda_B2_Q300_H8_E256_L4_P4     4.34  True
+torch_eager              cuda_B1_Q100_H8_E256_L4_P4     3.38  True
+torch_eager              cuda_B1_Q300_H8_E256_L4_P4     4.08  True
+torch_eager              cuda_B2_Q100_H8_E256_L4_P4     4.16  True
+torch_eager              cuda_B2_Q300_H8_E256_L4_P4     4.17  True
 
+
+
▶ UV Install Logs
+ +

Artifacts:

deformable_detr.jsonl diff --git a/deformable_detr/results/artifacts/combine/latency.svg b/deformable_detr/results/artifacts/combine/latency.svg index e3e32b63f54e8c9a4f463f301624b978638d2477..a879821c456233d808ff03473a00a8956b74435c 100644 --- a/deformable_detr/results/artifacts/combine/latency.svg +++ b/deformable_detr/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88116c3810103702d4e4bca4659d09621c275dbe5bc24360506bd5c5adb84f9c -size 14871 +oid sha256:39bf256158907575092097d20bcc588a7fb4ce049cb7b107bfda5e17eb6307c7 +size 14865 diff --git a/deformable_detr/results/combined_results.html b/deformable_detr/results/combined_results.html index 00605e31a3c927b6e1e6dd2a79924c4542acefa8..7137aaadbc90bc5044b3785493eb7c2faa4bc91b 100644 --- a/deformable_detr/results/combined_results.html +++ b/deformable_detr/results/combined_results.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { - 2025-12-19T19:55:30.123615 + 2025-12-19T23:02:54.345828 image/svg+xml @@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content { - + - + - 0 + 0 - + - + - 1 + 1 - + - + - 2 + 2 - + - + - 3 + 3 - + - + - 4 + 4 @@ -4044,26 +4044,26 @@ body[data-tool="eraser"] .main-content { - + - - - + + + - + - - - + + + @@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.63s +Cell: combine | 4.41s | Raw @@ -4210,13 +4210,13 @@ COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True -hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.05 True -hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.05 True +hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True +hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True -torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.46 True -torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.24 True -torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.25 True -torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.34 True +torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.38 True +torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.08 True +torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.16 True +torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.17 True GENERATING COMBINED VISUALIZATION @@ -4236,7 +4236,7 @@ Implementations included:
▶ UV Install Logs
@@ -4249,7 +4249,7 @@ Installed 37 packages in 311ms - 2025-12-19T19:55:30.123615 + 2025-12-19T23:02:54.345828 image/svg+xml @@ -4333,70 +4333,70 @@ Installed 37 packages in 311ms - + - + - 0 + 0 - + - + - 1 + 1 - + - + - 2 + 2 - + - + - 3 + 3 - + - + - 4 + 4 @@ -4404,26 +4404,26 @@ Installed 37 packages in 311ms - + - - - + + + - + - - - + + + diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl index 8336dd759b4d4f71197003544cf453e001ae2472..7c9bdc18dd58287062d54b646f8bc5ef4b65254a 100644 --- a/flash_attn/impls/artifacts/benchmark/attention.jsonl +++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl @@ -1,6 +1,6 @@ -{"ts": "2025-12-19T19:55:13Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.971173999914754, "p50": 0.9783339999103191, "p90": 0.9836439999162394, "mean": 0.9789179998733744, "iqr": 0.007710000090810354, "raw_times": [0.9783339999103191, 0.975933999825429, 0.9836439999162394, 0.9855039998001303, 0.971173999914754], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0032949999185803, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:13Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0312540000541048, "p50": 1.039535000018077, "p90": 1.0408949999600736, "mean": 1.0369627999807562, "iqr": 0.00922000003811263, "raw_times": [1.031674999921961, 1.0408949999600736, 1.0414549999495648, 1.0312540000541048, 1.039535000018077], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0439259999657224, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.05486600000404, "p50": 1.0608159998355404, "p90": 1.0660549999101931, "mean": 1.062165799930881, "iqr": 0.010128999974767794, "raw_times": [1.0608159998355404, 1.0731659999692056, 1.0559259999354254, 1.05486600000404, 1.0660549999101931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0692559999370133, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0765559998162644, "p50": 1.0860869999760325, "p90": 1.0925159999715106, "mean": 1.0862464000183536, "iqr": 0.013049999779468635, "raw_times": [1.0860869999760325, 1.0925159999715106, 1.0765559998162644, 1.079466000192042, 1.0966070001359185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.10497600007875, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.247940999974162, "p50": 1.2629510001715971, "p90": 1.2655800001084572, "mean": 1.2603426000623585, "iqr": 0.014840000176263857, "raw_times": [1.2629510001715971, 1.247940999974162, 1.2655800001084572, 1.2507399999321933, 1.2745010001253831], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2752009999985603, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.228739999987738, "p50": 1.2448499999209162, "p90": 1.2651710001136962, "mean": 1.2494922000314546, "iqr": 0.028152000140835298, "raw_times": [1.228739999987738, 1.2448499999209162, 1.237018999972861, 1.2716810001620615, 1.2651710001136962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.262461000123949, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2105559999326942, "p50": 1.2135660001604265, "p90": 1.214856999922631, "mean": 1.213000200004899, "iqr": 0.0038309999581542797, "raw_times": [1.2149960000442661, 1.2110259999644768, 1.2105559999326942, 1.2135660001604265, 1.214856999922631], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2067360000855842, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2594280001394509, "p50": 1.2752780000937491, "p90": 1.2771070000781037, "mean": 1.2731776000691752, "iqr": 0.010640000027706265, "raw_times": [1.2752780000937491, 1.2664670000503975, 1.2771070000781037, 1.2594280001394509, 1.287607999984175], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2718570001197804, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2881479999578005, "p50": 1.2985280000066268, "p90": 1.2987470001917245, "mean": 1.2992600000416132, "iqr": 0.008449000233667903, "raw_times": [1.2902979999580566, 1.2881479999578005, 1.2985280000066268, 1.2987470001917245, 1.3205790000938578], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2919179998789332, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.32487900009437, "p50": 1.3346000000638014, "p90": 1.337429000159318, "mean": 1.3341430000764376, "iqr": 0.006821000170020852, "raw_times": [1.32487900009437, 1.337429000159318, 1.3346000000638014, 1.3306079999892972, 1.3431990000754013], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.327048999883118, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4795820000017557, "p50": 1.4878020001560799, "p90": 1.4919819998340245, "mean": 1.4892582000356924, "iqr": 0.004879999778495403, "raw_times": [1.4795820000017557, 1.4919819998340245, 1.487102000055529, 1.499823000131073, 1.4878020001560799], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.4706619999742543, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.507972999888807, "p50": 1.5174029999798222, "p90": 1.518043000032776, "mean": 1.5156109999679757, "iqr": 0.005300000111674308, "raw_times": [1.518043000032776, 1.5174029999798222, 1.5218930000173714, 1.507972999888807, 1.5127429999211017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.517042999921614, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py index 04ae262009c3d6e33aaa3e392d28c903f24c287c..8f163bdd918898ced9e858cd4197a85572d7ec8e 100644 --- a/flash_attn/impls/cells/benchmark.py +++ b/flash_attn/impls/cells/benchmark.py @@ -4,7 +4,6 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", -# "xformers", # ] # # [tool.uv.sources] @@ -13,18 +12,18 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -import xformers.ops as xops -def xformers_attention(q, k, v): - """xFormers memory efficient attention""" - # xFormers expects [batch, seq_len, heads, head_dim] - return xops.memory_efficient_attention(q, k, v) +def torch_flash(q, k, v): + qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v)) + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION): + o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt) + return o.transpose(1, 2).contiguous() run_benchmark( kernel_type=KernelTypeEnum.ATTENTION, - impl_name="xformers_meff", - impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"}, - impl_func=xformers_attention, + impl_name="torch_flash_ma", + impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, + impl_func=torch_flash, ) \ No newline at end of file diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html index 3dee29b24907b8d331d08c41698609879a2fdde4..5080aca810331a41d3b9d72925e47ca3f624666b 100644 --- a/flash_attn/impls/flash_attention.html +++ b/flash_attn/impls/flash_attention.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.25s +Cell: nv | 0.28s | Raw @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:41:23 2025       
+
Fri Dec 19 23:02:01 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   31C    P0            107W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   42C    P0             86W /  350W |       0MiB /  46068MiB |     20%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 4.12s
+Cell: benchmark | 4.27s
  | 
 
 Raw
@@ -3989,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.560ms       101.41%       3.560ms       3.560ms             1  
-                                         torch_flash_ma         6.12%     330.406us        49.12%       2.651ms       2.651ms       0.000us         0.00%       3.550ms       3.550ms             1  
-                     aten::scaled_dot_product_attention         0.76%      41.091us         4.12%     222.225us      74.075us       0.000us         0.00%       2.785ms     928.191us             3  
-              aten::_scaled_dot_product_flash_attention         0.57%      30.902us         3.36%     181.134us      60.378us       0.000us         0.00%       2.785ms     928.191us             3  
-                         aten::_flash_attention_forward         0.74%      39.881us         2.41%     130.323us      43.441us       2.785ms        79.34%       2.785ms     928.191us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.785ms        79.34%       2.785ms     928.191us             3  
-                                       aten::contiguous         0.24%      12.809us        37.68%       2.033ms     169.455us       0.000us         0.00%     765.791us      63.816us            12  
-                                            aten::clone         0.64%      34.521us        37.44%       2.021ms     168.387us       0.000us         0.00%     765.791us      63.816us            12  
-                                            aten::copy_         1.67%      90.094us        35.26%       1.903ms     158.570us     725.311us        20.66%     765.791us      63.816us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     725.311us        20.66%     725.311us      60.443us            12  
-                                Activity Buffer Request        31.66%       1.709ms        31.66%       1.709ms       1.709ms      40.480us         1.15%      40.480us      40.480us             1  
-                                        aten::transpose         1.17%      63.269us         1.58%      85.140us       3.548us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.41%      21.871us         0.41%      21.871us       0.911us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.47%      25.421us         1.97%     106.322us       7.088us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.76%      94.971us         1.76%      94.971us       3.957us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.37%     128.144us         2.37%     128.144us       8.543us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.32%      17.100us         0.32%      17.100us       5.700us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       2.290us         0.04%       2.290us       0.382us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.18%       9.631us         0.18%       9.631us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        50.88%       2.746ms        50.88%       2.746ms       2.746ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.630ms       102.63%       3.630ms       3.630ms             1  
+                                         torch_flash_ma         6.38%     352.556us        49.11%       2.714ms       2.714ms       0.000us         0.00%       3.576ms       3.576ms             1  
+                     aten::scaled_dot_product_attention         0.73%      40.491us         3.98%     220.075us      73.358us       0.000us         0.00%       2.821ms     940.462us             3  
+              aten::_scaled_dot_product_flash_attention         0.47%      25.779us         3.25%     179.584us      59.861us       0.000us         0.00%       2.821ms     940.462us             3  
+                         aten::_flash_attention_forward         0.70%      38.829us         2.35%     129.692us      43.231us       2.821ms        79.77%       2.821ms     940.462us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.821ms        79.77%       2.821ms     940.462us             3  
+                                       aten::contiguous         0.22%      12.191us        37.53%       2.074ms     172.866us       0.000us         0.00%     755.108us      62.926us            12  
+                                            aten::clone         0.60%      33.381us        37.31%       2.062ms     171.850us       0.000us         0.00%     755.108us      62.926us            12  
+                                            aten::copy_         1.61%      89.181us        35.26%       1.949ms     162.385us     715.299us        20.23%     755.108us      62.926us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     715.299us        20.23%     715.299us      59.608us            12  
+                                Activity Buffer Request        31.81%       1.758ms        31.81%       1.758ms       1.758ms      39.809us         1.13%      39.809us      39.809us             1  
+                                        aten::transpose         1.21%      66.774us         1.65%      91.006us       3.792us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.44%      24.232us         0.44%      24.232us       1.010us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.44%      24.459us         1.87%     103.512us       6.901us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.67%      92.213us         1.67%      92.213us       3.842us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.28%     126.282us         2.28%     126.282us       8.419us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.31%      16.960us         0.31%      16.960us       5.653us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       2.141us         0.04%       2.141us       0.357us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.19%      10.441us         0.19%      10.441us       3.480us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        50.89%       2.813ms        50.89%       2.813ms       2.813ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.397ms
-Self CUDA time total: 3.510ms
+Self CPU time total: 5.527ms
+Self CUDA time total: 3.537ms
 
 
 
@@ -4021,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.59%     254.063us        44.59%       2.468ms       2.468ms       0.000us         0.00%       3.765ms       3.765ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.721ms       100.30%       3.721ms       3.721ms             1  
-                     aten::scaled_dot_product_attention         0.43%      23.691us         3.30%     182.385us      60.795us       0.000us         0.00%       2.950ms     983.280us             3  
-              aten::_scaled_dot_product_flash_attention         0.32%      17.969us         2.87%     158.694us      52.898us       0.000us         0.00%       2.950ms     983.280us             3  
-                         aten::_flash_attention_forward         0.74%      40.930us         2.17%     120.223us      40.074us       2.950ms        79.52%       2.950ms     983.280us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.950ms        79.52%       2.950ms     983.280us             3  
-                                       aten::contiguous         0.16%       8.922us        35.94%       1.989ms     165.775us       0.000us         0.00%     815.354us      67.946us            12  
-                                            aten::clone         0.46%      25.650us        35.78%       1.980ms     165.031us       0.000us         0.00%     815.354us      67.946us            12  
-                                            aten::copy_         1.41%      78.081us        34.18%       1.891ms     157.619us     759.770us        20.48%     815.354us      67.946us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     759.770us        20.48%     759.770us      63.314us            12  
-                                Activity Buffer Request        31.33%       1.734ms        31.33%       1.734ms       1.734ms      55.584us         1.50%      55.584us      55.584us             1  
-                                        aten::transpose         0.84%      46.272us         1.13%      62.592us       2.608us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.29%      16.320us         0.29%      16.320us       0.680us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.39%      21.392us         1.49%      82.711us       5.514us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.42%      78.721us         1.42%      78.721us       3.280us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.84%     101.714us         1.84%     101.714us       6.781us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.25%      13.930us         0.25%      13.930us       4.643us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.700us         0.03%       1.700us       0.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.08%       4.360us         0.08%       4.360us       1.453us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        55.41%       3.067ms        55.41%       3.067ms       3.067ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.56%     252.356us        44.41%       2.457ms       2.457ms       0.000us         0.00%       3.793ms       3.793ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.748ms       100.29%       3.748ms       3.748ms             1  
+                     aten::scaled_dot_product_attention         0.44%      24.090us         3.37%     186.293us      62.098us       0.000us         0.00%       2.975ms     991.820us             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      18.721us         2.93%     162.203us      54.068us       0.000us         0.00%       2.975ms     991.820us             3  
+                         aten::_flash_attention_forward         0.77%      42.568us         2.18%     120.522us      40.174us       2.975ms        79.63%       2.975ms     991.820us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.975ms        79.63%       2.975ms     991.820us             3  
+                                       aten::contiguous         0.18%       9.899us        35.65%       1.973ms     164.423us       0.000us         0.00%     817.633us      68.136us            12  
+                                            aten::clone         0.53%      29.604us        35.48%       1.963ms     163.598us       0.000us         0.00%     817.633us      68.136us            12  
+                                            aten::copy_         1.46%      80.732us        33.77%       1.869ms     155.723us     761.377us        20.37%     817.633us      68.136us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     761.377us        20.37%     761.377us      63.448us            12  
+                                Activity Buffer Request        30.82%       1.705ms        30.82%       1.705ms       1.705ms      56.256us         1.51%      56.256us      56.256us             1  
+                                        aten::transpose         0.91%      50.232us         1.24%      68.680us       2.862us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      18.448us         0.33%      18.448us       0.769us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.42%      23.239us         1.52%      84.240us       5.616us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.37%      76.011us         1.37%      76.011us       3.167us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.93%     106.693us         1.93%     106.693us       7.113us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      13.951us         0.25%      13.951us       4.650us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.720us         0.03%       1.720us       0.287us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.701us         0.07%       3.701us       1.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        55.59%       3.076ms        55.59%       3.076ms       3.076ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
 Self CPU time total: 5.534ms
-Self CUDA time total: 3.710ms
+Self CUDA time total: 3.737ms
 
 
 
@@ -4053,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.62%     254.756us        44.14%       2.433ms       2.433ms       0.000us         0.00%       3.774ms       3.774ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.727ms       100.29%       3.727ms       3.727ms             1  
-                     aten::scaled_dot_product_attention         0.43%      23.830us         3.33%     183.454us      61.151us       0.000us         0.00%       2.942ms     980.796us             3  
-              aten::_scaled_dot_product_flash_attention         0.32%      17.891us         2.90%     159.624us      53.208us       0.000us         0.00%       2.942ms     980.796us             3  
-                         aten::_flash_attention_forward         0.73%      40.074us         2.20%     121.152us      40.384us       2.942ms        79.17%       2.942ms     980.796us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.942ms        79.17%       2.942ms     980.796us             3  
-                                       aten::contiguous         0.16%       8.718us        35.43%       1.953ms     162.745us       0.000us         0.00%     831.581us      69.298us            12  
-                                            aten::clone         0.47%      25.749us        35.27%       1.944ms     162.019us       0.000us         0.00%     831.581us      69.298us            12  
-                                            aten::copy_         1.40%      77.041us        33.64%       1.855ms     154.552us     774.142us        20.83%     831.581us      69.298us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     774.142us        20.83%     774.142us      64.512us            12  
-                                Activity Buffer Request        30.83%       1.700ms        30.83%       1.700ms       1.700ms      57.439us         1.55%      57.439us      57.439us             1  
-                                        aten::transpose         0.84%      46.360us         1.13%      62.482us       2.603us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.29%      16.122us         0.29%      16.122us       0.672us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.36%      19.611us         1.53%      84.374us       5.625us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.44%      79.561us         1.44%      79.561us       3.315us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.87%     102.913us         1.87%     102.913us       6.861us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.28%      15.330us         0.28%      15.330us       5.110us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.680us         0.03%       1.680us       0.280us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.840us         0.07%       3.840us       1.280us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        55.86%       3.080ms        55.86%       3.080ms       3.080ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.60%     257.767us        43.91%       2.459ms       2.459ms       0.000us         0.00%       3.868ms       3.868ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.820ms       100.28%       3.820ms       3.820ms             1  
+                     aten::scaled_dot_product_attention         0.42%      23.451us         3.31%     185.194us      61.731us       0.000us         0.00%       3.025ms       1.008ms             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      19.728us         2.89%     161.743us      53.914us       0.000us         0.00%       3.025ms       1.008ms             3  
+                         aten::_flash_attention_forward         0.72%      40.171us         2.13%     119.133us      39.711us       3.025ms        79.42%       3.025ms       1.008ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.025ms        79.42%       3.025ms       1.008ms             3  
+                                       aten::contiguous         0.17%       9.680us        35.16%       1.969ms     164.068us       0.000us         0.00%     843.394us      70.283us            12  
+                                            aten::clone         0.57%      32.118us        34.99%       1.959ms     163.261us       0.000us         0.00%     843.394us      70.283us            12  
+                                            aten::copy_         1.44%      80.682us        33.24%       1.861ms     155.084us     783.938us        20.58%     843.394us      70.283us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     783.938us        20.58%     783.938us      65.328us            12  
+                                Activity Buffer Request        30.29%       1.696ms        30.29%       1.696ms       1.696ms      59.456us         1.56%      59.456us      59.456us             1  
+                                        aten::transpose         0.92%      51.272us         1.25%      69.843us       2.910us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      18.571us         0.33%      18.571us       0.774us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.37%      20.823us         1.56%      87.172us       5.811us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.42%      79.691us         1.42%      79.691us       3.320us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.92%     107.532us         1.92%     107.532us       7.169us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      15.890us         0.28%      15.890us       5.297us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.700us         0.03%       1.700us       0.283us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.820us         0.07%       3.820us       1.273us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.09%       3.140ms        56.09%       3.140ms       3.140ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.513ms
-Self CUDA time total: 3.717ms
+Self CPU time total: 5.599ms
+Self CUDA time total: 3.809ms
 
 
 
@@ -4085,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.28%     249.055us        45.91%       2.672ms       2.672ms       0.000us         0.00%       3.870ms       3.870ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.822ms       100.28%       3.822ms       3.822ms             1  
-                     aten::scaled_dot_product_attention         0.44%      25.342us         3.23%     187.955us      62.652us       0.000us         0.00%       3.022ms       1.007ms             3  
-              aten::_scaled_dot_product_flash_attention         0.30%      17.701us         2.79%     162.613us      54.204us       0.000us         0.00%       3.022ms       1.007ms             3  
-                         aten::_flash_attention_forward         0.71%      41.280us         2.11%     122.541us      40.847us       3.022ms        79.29%       3.022ms       1.007ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.022ms        79.29%       3.022ms       1.007ms             3  
-                                       aten::contiguous         0.16%       9.081us        37.65%       2.191ms     182.597us       0.000us         0.00%     847.483us      70.624us            12  
-                                            aten::clone         0.47%      27.546us        37.50%       2.182ms     181.840us       0.000us         0.00%     847.483us      70.624us            12  
-                                            aten::copy_         1.40%      81.736us        35.91%       2.090ms     174.156us     789.211us        20.71%     847.483us      70.624us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     789.211us        20.71%     789.211us      65.768us            12  
-                                Activity Buffer Request        29.46%       1.714ms        29.46%       1.714ms       1.714ms      58.272us         1.53%      58.272us      58.272us             1  
-                                        aten::transpose         0.83%      48.521us         1.13%      65.981us       2.749us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.30%      17.460us         0.30%      17.460us       0.727us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.35%      20.461us         1.45%      84.343us       5.623us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.38%      80.070us         1.38%      80.070us       3.336us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.47%     318.217us         5.47%     318.217us      21.214us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.25%      14.521us         0.25%      14.521us       4.840us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.689us         0.03%       1.689us       0.282us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.08%       4.671us         0.08%       4.671us       1.557us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        54.09%       3.147ms        54.09%       3.147ms       3.147ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.31%     257.497us        46.49%       2.779ms       2.779ms       0.000us         0.00%       3.937ms       3.937ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.890ms       100.28%       3.890ms       3.890ms             1  
+                     aten::scaled_dot_product_attention         0.42%      25.301us         3.15%     188.584us      62.861us       0.000us         0.00%       3.098ms       1.033ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      20.249us         2.73%     163.283us      54.428us       0.000us         0.00%       3.098ms       1.033ms             3  
+                         aten::_flash_attention_forward         0.67%      40.000us         1.99%     118.763us      39.588us       3.098ms        79.85%       3.098ms       1.033ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.098ms        79.85%       3.098ms       1.033ms             3  
+                                       aten::contiguous         0.17%      10.243us        38.20%       2.284ms     190.292us       0.000us         0.00%     838.882us      69.907us            12  
+                                            aten::clone         0.53%      31.478us        38.03%       2.273ms     189.439us       0.000us         0.00%     838.882us      69.907us            12  
+                                            aten::copy_         1.35%      80.860us        36.38%       2.175ms     181.246us     781.730us        20.15%     838.882us      69.907us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     781.730us        20.15%     781.730us      65.144us            12  
+                                Activity Buffer Request        30.50%       1.823ms        30.50%       1.823ms       1.823ms      57.152us         1.47%      57.152us      57.152us             1  
+                                        aten::transpose         0.90%      53.920us         1.24%      74.061us       3.086us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      20.141us         0.34%      20.141us       0.839us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.36%      21.362us         1.47%      87.614us       5.841us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.34%      79.813us         1.34%      79.813us       3.326us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.91%     293.806us         4.91%     293.806us      19.587us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      15.670us         0.26%      15.670us       5.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.659us         0.03%       1.659us       0.276us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.921us         0.07%       3.921us       1.307us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.51%       3.199ms        53.51%       3.199ms       3.199ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.819ms
-Self CUDA time total: 3.811ms
+Self CPU time total: 5.978ms
+Self CUDA time total: 3.880ms
 
 
 
@@ -4117,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.79%     300.628us        43.01%       2.699ms       2.699ms       0.000us         0.00%       4.340ms       4.340ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.290ms       100.25%       4.290ms       4.290ms             1  
-                     aten::scaled_dot_product_attention         0.40%      25.381us         2.96%     185.704us      61.901us       0.000us         0.00%       3.474ms       1.158ms             3  
-              aten::_scaled_dot_product_flash_attention         0.28%      17.780us         2.55%     160.323us      53.441us       0.000us         0.00%       3.474ms       1.158ms             3  
-                         aten::_flash_attention_forward         0.64%      40.370us         1.93%     121.223us      40.408us       3.474ms        81.17%       3.474ms       1.158ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.474ms        81.17%       3.474ms       1.158ms             3  
-                                       aten::contiguous         0.14%       9.022us        34.56%       2.169ms     180.719us       0.000us         0.00%     866.336us      72.195us            12  
-                                            aten::clone         0.44%      27.858us        34.41%       2.160ms     179.967us       0.000us         0.00%     866.336us      72.195us            12  
-                                            aten::copy_         1.24%      77.719us        32.91%       2.066ms     172.130us     806.048us        18.83%     866.336us      72.195us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     806.048us        18.83%     806.048us      67.171us            12  
-                                Activity Buffer Request        27.70%       1.738ms        27.70%       1.738ms       1.738ms      60.288us         1.41%      60.288us      60.288us             1  
-                                        aten::transpose         0.77%      48.240us         1.05%      65.650us       2.735us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.28%      17.410us         0.28%      17.410us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.34%      21.363us         1.38%      86.453us       5.764us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.28%      80.561us         1.28%      80.561us       3.357us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.36%     273.888us         4.36%     273.888us      18.259us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.24%      14.900us         0.24%      14.900us       4.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.700us         0.03%       1.700us       0.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.100us         0.07%       4.100us       1.367us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        56.99%       3.576ms        56.99%       3.576ms       3.576ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.81%     305.765us        42.59%       2.710ms       2.710ms       0.000us         0.00%       4.451ms       4.451ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.400ms       100.25%       4.400ms       4.400ms             1  
+                     aten::scaled_dot_product_attention         0.38%      24.020us         2.97%     188.924us      62.975us       0.000us         0.00%       3.579ms       1.193ms             3  
+              aten::_scaled_dot_product_flash_attention         0.31%      19.571us         2.59%     164.904us      54.968us       0.000us         0.00%       3.579ms       1.193ms             3  
+                         aten::_flash_attention_forward         0.68%      43.108us         1.92%     122.012us      40.671us       3.579ms        81.54%       3.579ms       1.193ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.579ms        81.54%       3.579ms       1.193ms             3  
+                                       aten::contiguous         0.15%       9.589us        34.07%       2.168ms     180.670us       0.000us         0.00%     871.616us      72.635us            12  
+                                            aten::clone         0.54%      34.360us        33.92%       2.158ms     179.871us       0.000us         0.00%     871.616us      72.635us            12  
+                                            aten::copy_         1.33%      84.914us        32.32%       2.057ms     171.390us     810.495us        18.46%     871.616us      72.635us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     810.495us        18.46%     810.495us      67.541us            12  
+                                Activity Buffer Request        26.57%       1.691ms        26.57%       1.691ms       1.691ms      61.121us         1.39%      61.121us      61.121us             1  
+                                        aten::transpose         0.82%      51.874us         1.12%      70.963us       2.957us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.30%      19.089us         0.30%      19.089us       0.795us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.34%      21.431us         1.39%      88.502us       5.900us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.27%      80.674us         1.27%      80.674us       3.361us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.78%     304.046us         4.78%     304.046us      20.270us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      15.780us         0.25%      15.780us       5.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.02%       1.550us         0.02%       1.550us       0.258us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.750us         0.06%       3.750us       1.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        57.41%       3.653ms        57.41%       3.653ms       3.653ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.275ms
-Self CUDA time total: 4.280ms
+Self CPU time total: 6.363ms
+Self CUDA time total: 4.389ms
 
 
 
@@ -4149,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.01%     253.526us        41.16%       2.602ms       2.602ms       0.000us         0.00%       4.429ms       4.429ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.378ms       100.24%       4.378ms       4.378ms             1  
-                     aten::scaled_dot_product_attention         0.38%      23.889us         2.89%     182.483us      60.828us       0.000us         0.00%       3.556ms       1.185ms             3  
-              aten::_scaled_dot_product_flash_attention         0.27%      17.360us         2.51%     158.594us      52.865us       0.000us         0.00%       3.556ms       1.185ms             3  
-                         aten::_flash_attention_forward         0.66%      42.013us         1.90%     120.422us      40.141us       3.556ms        81.42%       3.556ms       1.185ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.556ms        81.42%       3.556ms       1.185ms             3  
-                                       aten::contiguous         0.14%       8.630us        33.58%       2.122ms     176.875us       0.000us         0.00%     872.667us      72.722us            12  
-                                            aten::clone         0.41%      26.047us        33.44%       2.114ms     176.156us       0.000us         0.00%     872.667us      72.722us            12  
-                                            aten::copy_         1.25%      79.082us        32.00%       2.023ms     168.597us     811.483us        18.58%     872.667us      72.722us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     811.483us        18.58%     811.483us      67.624us            12  
-                                Activity Buffer Request        26.87%       1.699ms        26.87%       1.699ms       1.699ms      61.184us         1.40%      61.184us      61.184us             1  
-                                        aten::transpose         0.75%      47.653us         1.02%      64.533us       2.689us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.27%      16.880us         0.27%      16.880us       0.703us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.33%      20.879us         1.34%      84.642us       5.643us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.25%      79.031us         1.25%      79.031us       3.293us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.24%     268.168us         4.24%     268.168us      17.878us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.23%      14.592us         0.23%      14.592us       4.864us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.679us         0.03%       1.679us       0.280us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.06%       3.920us         0.06%       3.920us       1.307us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.84%       3.719ms        58.84%       3.719ms       3.719ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         3.57%     230.352us        40.90%       2.641ms       2.641ms       0.000us         0.00%       4.540ms       4.540ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.489ms       100.24%       4.489ms       4.489ms             1  
+                     aten::scaled_dot_product_attention         0.38%      24.551us         2.77%     178.785us      59.595us       0.000us         0.00%       3.667ms       1.222ms             3  
+              aten::_scaled_dot_product_flash_attention         0.30%      19.129us         2.39%     154.234us      51.411us       0.000us         0.00%       3.667ms       1.222ms             3  
+                         aten::_flash_attention_forward         0.55%      35.197us         1.71%     110.631us      36.877us       3.667ms        81.88%       3.667ms       1.222ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.667ms        81.88%       3.667ms       1.222ms             3  
+                                       aten::contiguous         0.16%      10.271us        33.78%       2.181ms     181.772us       0.000us         0.00%     873.057us      72.755us            12  
+                                            aten::clone         0.44%      28.652us        33.62%       2.171ms     180.916us       0.000us         0.00%     873.057us      72.755us            12  
+                                            aten::copy_         1.30%      83.713us        32.19%       2.078ms     173.208us     811.457us        18.12%     873.057us      72.755us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     811.457us        18.12%     811.457us      67.621us            12  
+                                Activity Buffer Request        26.83%       1.733ms        26.83%       1.733ms       1.733ms      61.600us         1.38%      61.600us      61.600us             1  
+                                        aten::transpose         0.85%      55.123us         1.17%      75.416us       3.142us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      20.293us         0.31%      20.293us       0.846us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.30%      19.350us         1.29%      83.431us       5.562us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.21%      78.153us         1.21%      78.153us       3.256us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.40%     284.286us         4.40%     284.286us      18.952us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.23%      14.650us         0.23%      14.650us       4.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.640us         0.03%       1.640us       0.273us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.05%       3.450us         0.05%       3.450us       1.150us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.10%       3.816ms        59.10%       3.816ms       3.816ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.322ms
-Self CUDA time total: 4.367ms
+Self CPU time total: 6.458ms
+Self CUDA time total: 4.478ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_flash_ma           cuda_attn_L128_bfloat16     1.21  True
-torch_flash_ma           cuda_attn_L256_bfloat16     1.25  True
-torch_flash_ma           cuda_attn_L320_bfloat16     1.28  True
-torch_flash_ma           cuda_attn_L384_bfloat16     1.31  True
-torch_flash_ma           cuda_attn_L448_bfloat16     1.45  True
-torch_flash_ma           cuda_attn_L512_bfloat16     1.49  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.30  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.49  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.52  True
 

Artifacts:

diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html index 75838f1cf08b0b4997aa1abbe76b50e4fd1152c2..be4cd4d2141274ee46091e7676725fd19ad3f2a2 100644 --- a/flash_attn/impls/hf_kernels_flash_attn.html +++ b/flash_attn/impls/hf_kernels_flash_attn.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: benchmark | 5.91s +Cell: benchmark | 5.83s | Raw @@ -3943,21 +3943,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 3.35% 155.232us 44.97% 2.082ms 2.082ms 0.000us 0.00% 3.704ms 3.704ms 1 - _flash_attn_9e27194::fwd 1.43% 66.152us 41.62% 1.927ms 642.264us 2.766ms 100.00% 3.704ms 1.235ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.06% 2.768ms 2.768ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.766ms 100.00% 2.766ms 922.153us 3 - Activity Buffer Request 37.12% 1.719ms 37.12% 1.719ms 1.719ms 937.630us 33.89% 937.630us 937.630us 1 - cudaDeviceGetAttribute 0.12% 5.360us 0.12% 5.360us 0.357us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.39% 18.222us 1.18% 54.592us 18.197us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.79% 36.370us 0.79% 36.370us 12.123us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.56% 25.741us 0.56% 25.741us 2.860us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.30% 13.770us 0.30% 13.770us 4.590us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.92% 42.401us 0.92% 42.401us 14.134us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 55.03% 2.548ms 55.03% 2.548ms 2.548ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 3.32% 153.894us 44.44% 2.062ms 2.062ms 0.000us 0.00% 3.741ms 3.741ms 1 + _flash_attn_9e27194::fwd 1.40% 65.047us 41.12% 1.908ms 636.067us 2.793ms 100.00% 3.741ms 1.247ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.05% 2.795ms 2.795ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.793ms 100.00% 2.793ms 931.053us 3 + Activity Buffer Request 36.76% 1.706ms 36.76% 1.706ms 1.706ms 947.811us 33.93% 947.811us 947.811us 1 + cudaDeviceGetAttribute 0.09% 4.281us 0.09% 4.281us 0.285us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.44% 20.280us 1.17% 54.161us 18.054us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.73% 33.881us 0.73% 33.881us 11.294us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.53% 24.740us 0.53% 24.740us 2.749us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.29% 13.452us 0.29% 13.452us 4.484us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.87% 40.582us 0.87% 40.582us 13.527us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 55.56% 2.579ms 55.56% 2.579ms 2.579ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.630ms -Self CUDA time total: 2.766ms +Self CPU time total: 4.641ms +Self CUDA time total: 2.793ms @@ -3967,21 +3967,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 1.95% 91.533us 41.78% 1.962ms 1.962ms 0.000us 0.00% 3.856ms 3.856ms 1 - _flash_attn_9e27194::fwd 1.04% 49.050us 39.83% 1.870ms 623.350us 2.882ms 100.00% 3.856ms 1.285ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.884ms 100.05% 2.884ms 2.884ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.882ms 100.00% 2.882ms 960.764us 3 - Activity Buffer Request 36.88% 1.732ms 36.88% 1.732ms 1.732ms 973.756us 33.78% 973.756us 973.756us 1 - cudaDeviceGetAttribute 0.09% 4.030us 0.09% 4.030us 0.269us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.18% 8.460us 0.61% 28.490us 9.497us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.43% 20.030us 0.43% 20.030us 6.677us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.55% 25.961us 0.55% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.700us 0.08% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.58% 27.091us 0.58% 27.091us 9.030us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.22% 2.734ms 58.22% 2.734ms 2.734ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 1.87% 88.452us 41.15% 1.950ms 1.950ms 0.000us 0.00% 3.925ms 3.925ms 1 + _flash_attn_9e27194::fwd 0.93% 44.030us 39.28% 1.861ms 620.420us 2.932ms 100.00% 3.925ms 1.308ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.933ms 100.05% 2.933ms 2.933ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.209us 3 + Activity Buffer Request 36.67% 1.738ms 36.67% 1.738ms 1.738ms 993.604us 33.89% 993.604us 993.604us 1 + cudaDeviceGetAttribute 0.08% 3.589us 0.08% 3.589us 0.239us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.361us 0.48% 22.851us 7.617us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.33% 15.490us 0.33% 15.490us 5.163us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.44% 21.020us 0.44% 21.020us 2.336us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.450us 0.07% 3.450us 1.150us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.60% 28.443us 0.60% 28.443us 9.481us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.85% 2.789ms 58.85% 2.789ms 2.789ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.695ms -Self CUDA time total: 2.882ms +Self CPU time total: 4.739ms +Self CUDA time total: 2.932ms @@ -3991,21 +3991,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.18% 107.861us 40.50% 2.008ms 2.008ms 0.000us 0.00% 4.125ms 4.125ms 1 - _flash_attn_9e27194::fwd 0.99% 48.872us 38.32% 1.900ms 633.314us 3.094ms 100.00% 4.125ms 1.375ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.095ms 100.05% 3.095ms 3.095ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 100.00% 3.094ms 1.031ms 3 - Activity Buffer Request 35.67% 1.768ms 35.67% 1.768ms 1.768ms 1.032ms 33.35% 1.032ms 1.032ms 1 - cudaDeviceGetAttribute 0.09% 4.480us 0.09% 4.480us 0.299us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.13% 6.580us 0.45% 22.520us 7.507us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.32% 15.940us 0.32% 15.940us 5.313us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.47% 23.250us 0.47% 23.250us 2.583us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.58% 28.541us 0.58% 28.541us 9.514us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 59.50% 2.950ms 59.50% 2.950ms 2.950ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.16% 105.271us 40.16% 1.954ms 1.954ms 0.000us 0.00% 4.088ms 4.088ms 1 + _flash_attn_9e27194::fwd 0.92% 44.671us 38.00% 1.849ms 616.384us 3.054ms 100.00% 4.088ms 1.363ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.056ms 100.05% 3.056ms 3.056ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.054ms 100.00% 3.054ms 1.018ms 3 + Activity Buffer Request 35.43% 1.724ms 35.43% 1.724ms 1.724ms 1.034ms 33.84% 1.034ms 1.034ms 1 + cudaDeviceGetAttribute 0.08% 3.741us 0.08% 3.741us 0.249us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 7.380us 0.46% 22.580us 7.527us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.31% 15.200us 0.31% 15.200us 5.067us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.43% 20.900us 0.43% 20.900us 2.322us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.441us 0.07% 3.441us 1.147us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.61% 29.670us 0.61% 29.670us 9.890us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 59.84% 2.912ms 59.84% 2.912ms 2.912ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.958ms -Self CUDA time total: 3.094ms +Self CPU time total: 4.867ms +Self CUDA time total: 3.054ms @@ -4015,21 +4015,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.18% 109.362us 41.99% 2.109ms 2.109ms 0.000us 0.00% 4.102ms 4.102ms 1 - _flash_attn_9e27194::fwd 1.01% 50.650us 39.81% 1.999ms 666.498us 3.061ms 100.00% 4.102ms 1.367ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.062ms 100.05% 3.062ms 3.062ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.061ms 100.00% 3.061ms 1.020ms 3 - Activity Buffer Request 33.41% 1.678ms 33.41% 1.678ms 1.678ms 1.041ms 34.02% 1.041ms 1.041ms 1 - cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.14% 6.851us 0.49% 24.381us 8.127us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.35% 17.530us 0.35% 17.530us 5.843us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.44% 22.140us 0.44% 22.140us 2.460us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.810us 0.08% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.31% 216.396us 4.31% 216.396us 72.132us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.01% 2.914ms 58.01% 2.914ms 2.914ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 1.99% 101.304us 41.40% 2.105ms 2.105ms 0.000us 0.00% 4.182ms 4.182ms 1 + _flash_attn_9e27194::fwd 0.90% 45.720us 39.41% 2.004ms 667.947us 3.124ms 100.00% 4.182ms 1.394ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.125ms 100.05% 3.125ms 3.125ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.124ms 100.00% 3.124ms 1.041ms 3 + Activity Buffer Request 33.36% 1.696ms 33.36% 1.696ms 1.696ms 1.058ms 33.87% 1.058ms 1.058ms 1 + cudaDeviceGetAttribute 0.07% 3.650us 0.07% 3.650us 0.243us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 7.421us 0.48% 24.201us 8.067us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.33% 16.780us 0.33% 16.780us 5.593us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.42% 21.431us 0.42% 21.431us 2.381us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 4.070us 0.08% 4.070us 1.357us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.10% 208.474us 4.10% 208.474us 69.491us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.60% 2.980ms 58.60% 2.980ms 2.980ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.023ms -Self CUDA time total: 3.061ms +Self CPU time total: 5.085ms +Self CUDA time total: 3.124ms @@ -4039,21 +4039,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 1.91% 108.693us 38.60% 2.193ms 2.193ms 0.000us 0.00% 4.850ms 4.850ms 1 - _flash_attn_9e27194::fwd 0.87% 49.481us 36.69% 2.084ms 694.644us 3.635ms 100.00% 4.850ms 1.617ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.637ms 100.05% 3.637ms 3.637ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.635ms 100.00% 3.635ms 1.212ms 3 - Activity Buffer Request 31.43% 1.785ms 31.43% 1.785ms 1.785ms 1.215ms 33.41% 1.215ms 1.215ms 1 - cudaDeviceGetAttribute 0.07% 3.761us 0.07% 3.761us 0.251us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.12% 6.970us 0.43% 24.340us 8.113us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.31% 17.370us 0.31% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.43% 24.270us 0.43% 24.270us 2.697us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.07% 3.730us 0.07% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.40% 193.224us 3.40% 193.224us 64.408us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 61.40% 3.487ms 61.40% 3.487ms 3.487ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 1.92% 106.253us 37.17% 2.059ms 2.059ms 0.000us 0.00% 4.843ms 4.843ms 1 + _flash_attn_9e27194::fwd 0.86% 47.751us 35.25% 1.953ms 651.011us 3.628ms 100.00% 4.843ms 1.614ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.629ms 100.04% 3.629ms 3.629ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.628ms 100.00% 3.628ms 1.209ms 3 + Activity Buffer Request 30.14% 1.670ms 30.14% 1.670ms 1.670ms 1.215ms 33.50% 1.215ms 1.215ms 1 + cudaDeviceGetAttribute 0.07% 3.881us 0.07% 3.881us 0.259us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.14% 7.581us 0.43% 24.021us 8.007us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.30% 16.440us 0.30% 16.440us 5.480us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.39% 21.710us 0.39% 21.710us 2.412us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.650us 0.07% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.29% 182.154us 3.29% 182.154us 60.718us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.83% 3.482ms 62.83% 3.482ms 3.482ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.680ms -Self CUDA time total: 3.635ms +Self CPU time total: 5.541ms +Self CUDA time total: 3.628ms @@ -4063,36 +4063,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 1.90% 106.201us 36.85% 2.064ms 2.064ms 0.000us 0.00% 4.915ms 4.915ms 1 - _flash_attn_9e27194::fwd 0.89% 50.062us 34.96% 1.958ms 652.751us 3.682ms 100.00% 4.915ms 1.638ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.684ms 100.05% 3.684ms 3.684ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.682ms 100.00% 3.682ms 1.227ms 3 - Activity Buffer Request 29.73% 1.666ms 29.73% 1.666ms 1.666ms 1.233ms 33.48% 1.233ms 1.233ms 1 - cudaDeviceGetAttribute 0.07% 4.189us 0.07% 4.189us 0.279us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.12% 6.851us 0.45% 25.301us 8.434us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.33% 18.450us 0.33% 18.450us 6.150us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.40% 22.632us 0.40% 22.632us 2.515us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.07% 3.850us 0.07% 3.850us 1.283us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.33% 186.623us 3.33% 186.623us 62.208us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.15% 3.537ms 63.15% 3.537ms 3.537ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 1.86% 105.712us 36.76% 2.092ms 2.092ms 0.000us 0.00% 4.990ms 4.990ms 1 + _flash_attn_9e27194::fwd 0.87% 49.631us 34.91% 1.986ms 661.968us 3.741ms 100.00% 4.990ms 1.663ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.05% 3.743ms 3.743ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.741ms 100.00% 3.741ms 1.247ms 3 + Activity Buffer Request 29.90% 1.701ms 29.90% 1.701ms 1.701ms 1.249ms 33.38% 1.249ms 1.249ms 1 + cudaDeviceGetAttribute 0.06% 3.600us 0.06% 3.600us 0.240us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.14% 7.780us 0.42% 24.150us 8.050us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.29% 16.370us 0.29% 16.370us 5.457us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.38% 21.420us 0.38% 21.420us 2.380us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.06% 3.580us 0.06% 3.580us 1.193us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.20% 182.154us 3.20% 182.154us 60.718us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.24% 3.598ms 63.24% 3.598ms 3.598ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.602ms -Self CUDA time total: 3.682ms +Self CPU time total: 5.689ms +Self CUDA time total: 3.741ms impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True -hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True -hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True -hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.24 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True +hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. -Fetching 20 files: 10%|█ | 2/20 [00:01<00:15, 1.16it/s] -Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.63it/s] +Fetching 20 files: 10%|█ | 2/20 [00:01<00:14, 1.28it/s] +Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 12.76it/s]

Artifacts:

diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html index 6de96c197d41188179febfb987a9a39856ff098f..1bbaf0fb8d27a332d041bdaf6c64aab0b9f38de9 100644 --- a/flash_attn/impls/hf_kernels_flash_attn3.html +++ b/flash_attn/impls/hf_kernels_flash_attn3.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: benchmark | 6.33s +Cell: benchmark | 10.25s | Raw @@ -3942,19 +3942,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 3.59% 165.413us 48.47% 2.234ms 2.234ms 0.000us 0.00% 3.561ms 3.561ms 1 - FlashAttnFunc 2.69% 124.054us 44.88% 2.069ms 689.509us 0.000us 0.00% 3.561ms 1.187ms 3 - _flash_attn3_1d39a44::fwd 1.63% 74.991us 42.19% 1.944ms 648.158us 2.673ms 100.00% 3.561ms 1.187ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.674ms 100.05% 2.674ms 2.674ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.673ms 100.00% 2.673ms 890.896us 3 - Activity Buffer Request 38.25% 1.763ms 38.25% 1.763ms 1.763ms 888.250us 33.23% 888.250us 888.250us 1 - aten::empty 0.95% 43.951us 0.95% 43.951us 7.325us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.32% 14.620us 0.32% 14.620us 4.873us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.04% 47.991us 1.04% 47.991us 15.997us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 51.53% 2.375ms 51.53% 2.375ms 2.375ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 3.82% 178.994us 47.00% 2.205ms 2.205ms 0.000us 0.00% 3.693ms 3.693ms 1 + FlashAttnFunc 2.66% 124.811us 43.19% 2.026ms 675.274us 0.000us 0.00% 3.693ms 1.231ms 3 + _flash_attn3_1d39a44::fwd 1.59% 74.650us 40.52% 1.901ms 633.671us 2.792ms 100.00% 3.693ms 1.231ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.698us 3 + Activity Buffer Request 36.63% 1.718ms 36.63% 1.718ms 1.718ms 900.576us 32.25% 900.576us 900.576us 1 + aten::empty 0.99% 46.443us 0.99% 46.443us 7.741us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.32% 14.861us 0.32% 14.861us 4.954us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.00% 46.891us 1.00% 46.891us 15.630us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.00% 2.486ms 53.00% 2.486ms 2.486ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.609ms -Self CUDA time total: 2.673ms +Self CPU time total: 4.691ms +Self CUDA time total: 2.792ms @@ -3964,19 +3964,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.68% 124.013us 44.92% 2.080ms 2.080ms 0.000us 0.00% 3.716ms 3.716ms 1 - FlashAttnFunc 1.96% 90.863us 42.24% 1.956ms 652.078us 0.000us 0.00% 3.716ms 1.239ms 3 - _flash_attn3_1d39a44::fwd 1.06% 49.109us 40.28% 1.865ms 621.790us 2.770ms 100.00% 3.716ms 1.239ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.772ms 100.05% 2.772ms 2.772ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.770ms 100.00% 2.770ms 923.461us 3 - Activity Buffer Request 37.83% 1.752ms 37.83% 1.752ms 1.752ms 945.210us 34.12% 945.210us 945.210us 1 - aten::empty 0.60% 27.931us 0.60% 27.931us 4.655us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.67% 30.831us 0.67% 30.831us 10.277us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 55.08% 2.551ms 55.08% 2.551ms 2.551ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.16% 100.183us 44.03% 2.042ms 2.042ms 0.000us 0.00% 3.752ms 3.752ms 1 + FlashAttnFunc 1.96% 91.001us 41.87% 1.942ms 647.204us 0.000us 0.00% 3.752ms 1.251ms 3 + _flash_attn3_1d39a44::fwd 1.03% 47.561us 39.91% 1.851ms 616.870us 2.814ms 100.00% 3.752ms 1.251ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 100.05% 2.816ms 2.816ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.00% 2.814ms 938.079us 3 + Activity Buffer Request 37.49% 1.739ms 37.49% 1.739ms 1.739ms 937.887us 33.33% 937.887us 937.887us 1 + aten::empty 0.58% 26.762us 0.58% 26.762us 4.460us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.220us 0.11% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.70% 32.410us 0.70% 32.410us 10.803us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 55.97% 2.595ms 55.97% 2.595ms 2.595ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.631ms -Self CUDA time total: 2.770ms +Self CPU time total: 4.637ms +Self CUDA time total: 2.814ms @@ -3986,19 +3986,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.68% 125.914us 44.02% 2.072ms 2.072ms 0.000us 0.00% 3.816ms 3.816ms 1 - FlashAttnFunc 1.89% 89.112us 41.34% 1.946ms 648.608us 0.000us 0.00% 3.816ms 1.272ms 3 - _flash_attn3_1d39a44::fwd 1.01% 47.500us 39.45% 1.857ms 618.904us 2.847ms 100.00% 3.816ms 1.272ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.849ms 100.05% 2.849ms 2.849ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.847ms 100.00% 2.847ms 949.087us 3 - Activity Buffer Request 37.07% 1.745ms 37.07% 1.745ms 1.745ms 968.895us 34.03% 968.895us 968.895us 1 - aten::empty 0.58% 27.171us 0.58% 27.171us 4.529us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.621us 0.12% 5.621us 1.874us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.67% 31.690us 0.67% 31.690us 10.563us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 55.98% 2.635ms 55.98% 2.635ms 2.635ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.13% 100.213us 42.34% 1.994ms 1.994ms 0.000us 0.00% 3.924ms 3.924ms 1 + FlashAttnFunc 1.82% 85.940us 40.21% 1.894ms 631.253us 0.000us 0.00% 3.924ms 1.308ms 3 + _flash_attn3_1d39a44::fwd 1.03% 48.325us 38.38% 1.808ms 602.607us 2.927ms 100.00% 3.924ms 1.308ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.929ms 100.05% 2.929ms 2.929ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.927ms 100.00% 2.927ms 975.684us 3 + Activity Buffer Request 36.02% 1.697ms 36.02% 1.697ms 1.697ms 997.252us 34.07% 997.252us 997.252us 1 + aten::empty 0.56% 26.419us 0.56% 26.419us 4.403us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.490us 0.12% 5.490us 1.830us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.66% 31.020us 0.66% 31.020us 10.340us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 57.66% 2.716ms 57.66% 2.716ms 2.716ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.706ms -Self CUDA time total: 2.847ms +Self CPU time total: 4.710ms +Self CUDA time total: 2.927ms @@ -4008,19 +4008,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.55% 127.134us 45.51% 2.268ms 2.268ms 0.000us 0.00% 3.920ms 3.920ms 1 - FlashAttnFunc 1.80% 89.881us 42.96% 2.141ms 713.505us 0.000us 0.00% 3.920ms 1.307ms 3 - _flash_attn3_1d39a44::fwd 0.97% 48.541us 41.15% 2.051ms 683.545us 2.930ms 100.00% 3.920ms 1.307ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.05% 2.932ms 2.932ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.930ms 100.00% 2.930ms 976.824us 3 - Activity Buffer Request 35.08% 1.748ms 35.08% 1.748ms 1.748ms 989.112us 33.75% 989.112us 989.112us 1 - aten::empty 0.54% 27.071us 0.54% 27.071us 4.512us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.498us 0.11% 5.498us 1.833us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.45% 221.646us 4.45% 221.646us 73.882us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 54.49% 2.715ms 54.49% 2.715ms 2.715ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.19% 98.471us 39.26% 1.764ms 1.764ms 0.000us 0.00% 3.945ms 3.945ms 1 + FlashAttnFunc 1.97% 88.443us 37.06% 1.666ms 555.216us 0.000us 0.00% 3.945ms 1.315ms 3 + _flash_attn3_1d39a44::fwd 1.11% 49.881us 35.10% 1.577ms 525.735us 2.942ms 100.00% 3.945ms 1.315ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.942ms 100.00% 2.942ms 980.556us 3 + Activity Buffer Request 27.81% 1.250ms 27.81% 1.250ms 1.250ms 1.003ms 34.09% 1.003ms 1.003ms 1 + aten::empty 0.60% 26.780us 0.60% 26.780us 4.463us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.141us 0.11% 5.141us 1.714us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.46% 245.555us 5.46% 245.555us 81.852us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.74% 2.730ms 60.74% 2.730ms 2.730ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.983ms -Self CUDA time total: 2.930ms +Self CPU time total: 4.494ms +Self CUDA time total: 2.942ms @@ -4030,19 +4030,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.34% 128.034us 40.76% 2.227ms 2.227ms 0.000us 0.00% 4.607ms 4.607ms 1 - FlashAttnFunc 1.67% 91.131us 38.42% 2.098ms 699.492us 0.000us 0.00% 4.607ms 1.536ms 3 - _flash_attn3_1d39a44::fwd 0.87% 47.661us 36.75% 2.007ms 669.115us 3.452ms 100.00% 4.607ms 1.536ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.453ms 100.05% 3.453ms 3.453ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.452ms 100.00% 3.452ms 1.151ms 3 - Activity Buffer Request 31.93% 1.744ms 31.93% 1.744ms 1.744ms 1.156ms 33.48% 1.156ms 1.156ms 1 - aten::empty 0.52% 28.231us 0.52% 28.231us 4.705us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.10% 5.270us 0.10% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.33% 181.994us 3.33% 181.994us 60.665us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 59.24% 3.235ms 59.24% 3.235ms 3.235ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 1.83% 100.852us 39.93% 2.202ms 2.202ms 0.000us 0.00% 4.714ms 4.714ms 1 + FlashAttnFunc 1.62% 89.332us 38.10% 2.101ms 700.422us 0.000us 0.00% 4.714ms 1.571ms 3 + _flash_attn3_1d39a44::fwd 0.86% 47.622us 36.48% 2.012ms 670.645us 3.530ms 100.00% 4.714ms 1.571ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.531ms 100.04% 3.531ms 3.531ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.530ms 100.00% 3.530ms 1.177ms 3 + Activity Buffer Request 31.48% 1.736ms 31.48% 1.736ms 1.736ms 1.184ms 33.56% 1.184ms 1.184ms 1 + aten::empty 0.51% 27.890us 0.51% 27.890us 4.648us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.09% 5.140us 0.09% 5.140us 1.713us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.53% 194.875us 3.53% 194.875us 64.958us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.07% 3.313ms 60.07% 3.313ms 3.313ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.462ms -Self CUDA time total: 3.452ms +Self CPU time total: 5.515ms +Self CUDA time total: 3.530ms @@ -4052,40 +4052,39 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.42% 135.303us 41.95% 2.345ms 2.345ms 0.000us 0.00% 4.617ms 4.617ms 1 - FlashAttnFunc 1.78% 99.322us 39.53% 2.210ms 736.513us 0.000us 0.00% 4.617ms 1.539ms 3 - _flash_attn3_1d39a44::fwd 0.92% 51.382us 37.75% 2.110ms 703.406us 3.463ms 100.00% 4.617ms 1.539ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.05% 3.464ms 3.464ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3 - Activity Buffer Request 33.12% 1.851ms 33.12% 1.851ms 1.851ms 1.155ms 33.34% 1.155ms 1.155ms 1 - aten::empty 0.54% 30.101us 0.54% 30.101us 5.017us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.10% 5.430us 0.10% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.08% 171.953us 3.08% 171.953us 57.318us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.05% 3.245ms 58.05% 3.245ms 3.245ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 1.85% 100.143us 39.23% 2.129ms 2.129ms 0.000us 0.00% 4.688ms 4.688ms 1 + FlashAttnFunc 1.59% 86.190us 37.39% 2.029ms 676.324us 0.000us 0.00% 4.688ms 1.563ms 3 + _flash_attn3_1d39a44::fwd 0.90% 48.962us 35.80% 1.943ms 647.594us 3.510ms 100.00% 4.688ms 1.563ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.512ms 100.05% 3.512ms 3.512ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.510ms 100.00% 3.510ms 1.170ms 3 + Activity Buffer Request 31.16% 1.691ms 31.16% 1.691ms 1.691ms 1.178ms 33.55% 1.178ms 1.178ms 1 + aten::empty 0.49% 26.491us 0.49% 26.491us 4.415us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.09% 5.060us 0.09% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.15% 171.134us 3.15% 171.134us 57.045us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.77% 3.297ms 60.77% 3.297ms 3.297ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.590ms -Self CUDA time total: 3.463ms +Self CPU time total: 5.427ms +Self CUDA time total: 3.510ms impl wl p50(ms) ok -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True -hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True +hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.22 True
▶ UV Install Logs
-
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 5 files: 20%|██ | 1/5 [00:00<00:01, 3.45it/s] -Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.11it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 3.08it/s]
+
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] +Fetching 5 files: 20%|██ | 1/5 [00:00<00:00, 9.30it/s] +Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.12it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 3.22it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html index b26e66d7ef0ab7e42a4fc5cbe5c4536eb65fe29a..15014fc5e8e1dd16c75ce6a88c2556b04d0216c8 100644 --- a/flash_attn/impls/mem_efficient_attention.html +++ b/flash_attn/impls/mem_efficient_attention.html @@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content { ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 8.14s +Cell: benchmark | 4.12s | Raw @@ -3941,28 +3941,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.462ms 101.52% 5.462ms 5.462ms 1 - torch_mem_eff 4.78% 351.785us 36.36% 2.675ms 2.675ms 0.000us 0.00% 5.434ms 5.434ms 1 - aten::scaled_dot_product_attention 0.44% 32.361us 3.09% 227.216us 75.739us 0.000us 0.00% 4.760ms 1.587ms 3 - aten::_scaled_dot_product_efficient_attention 0.32% 23.392us 2.65% 194.855us 64.952us 0.000us 0.00% 4.760ms 1.587ms 3 - aten::_efficient_attention_forward 0.47% 34.731us 1.98% 145.602us 48.534us 4.760ms 88.47% 4.760ms 1.587ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.760ms 88.47% 4.760ms 1.587ms 3 - aten::contiguous 0.14% 10.161us 27.51% 2.023ms 224.817us 0.000us 0.00% 673.947us 74.883us 9 - aten::clone 0.40% 29.063us 27.37% 2.013ms 223.688us 0.000us 0.00% 673.947us 74.883us 9 - aten::copy_ 1.06% 77.620us 25.90% 1.905ms 211.680us 620.444us 11.53% 673.947us 74.883us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 620.444us 11.53% 620.444us 68.938us 9 - Activity Buffer Request 23.68% 1.742ms 23.68% 1.742ms 1.742ms 53.503us 0.99% 53.503us 53.503us 1 - aten::transpose 0.99% 72.964us 1.33% 98.194us 4.091us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.34% 25.230us 0.34% 25.230us 1.051us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.25% 18.168us 1.07% 79.009us 8.779us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 1.28% 94.381us 1.28% 94.381us 4.494us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.49% 109.573us 1.49% 109.573us 9.131us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.05% 3.660us 0.05% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.67% 49.491us 0.67% 49.491us 16.497us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.64% 4.681ms 63.64% 4.681ms 4.681ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 4.25% 311.827us 34.94% 2.563ms 2.563ms 0.000us 0.00% 5.488ms 5.488ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.465ms 100.56% 5.465ms 5.465ms 1 + aten::scaled_dot_product_attention 0.42% 30.830us 2.38% 174.593us 58.198us 0.000us 0.00% 4.817ms 1.606ms 3 + aten::_scaled_dot_product_efficient_attention 0.32% 23.429us 1.96% 143.763us 47.921us 0.000us 0.00% 4.817ms 1.606ms 3 + aten::_efficient_attention_forward 0.46% 33.832us 1.33% 97.922us 32.641us 4.817ms 88.64% 4.817ms 1.606ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.817ms 88.64% 4.817ms 1.606ms 3 + aten::contiguous 0.14% 10.180us 27.43% 2.012ms 223.532us 0.000us 0.00% 670.850us 74.539us 9 + aten::clone 0.43% 31.262us 27.29% 2.002ms 222.401us 0.000us 0.00% 670.850us 74.539us 9 + aten::copy_ 1.01% 74.042us 25.85% 1.896ms 210.687us 617.346us 11.36% 670.850us 74.539us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.346us 11.36% 617.346us 68.594us 9 + Activity Buffer Request 23.70% 1.738ms 23.70% 1.738ms 1.738ms 53.504us 0.98% 53.504us 53.504us 1 + aten::transpose 0.89% 65.502us 1.19% 87.343us 3.639us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.30% 21.841us 0.30% 21.841us 0.910us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.21% 15.520us 1.01% 74.161us 8.240us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.17% 85.772us 1.17% 85.772us 4.084us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.48% 108.273us 1.48% 108.273us 9.023us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.05% 3.869us 0.05% 3.869us 1.290us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.12% 8.830us 0.12% 8.830us 2.943us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 65.06% 4.772ms 65.06% 4.772ms 4.772ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.356ms -Self CUDA time total: 5.380ms +Self CPU time total: 7.335ms +Self CUDA time total: 5.434ms @@ -3972,28 +3972,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 2.99% 227.637us 31.17% 2.369ms 2.369ms 0.000us 0.00% 5.835ms 5.835ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.790ms 100.14% 5.790ms 5.790ms 1 - aten::scaled_dot_product_attention 0.23% 17.721us 1.87% 142.143us 47.381us 0.000us 0.00% 5.146ms 1.715ms 3 - aten::_scaled_dot_product_efficient_attention 0.25% 18.819us 1.64% 124.422us 41.474us 0.000us 0.00% 5.146ms 1.715ms 3 - aten::_efficient_attention_forward 0.37% 28.141us 1.08% 82.262us 27.421us 5.146ms 89.01% 5.146ms 1.715ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.146ms 89.01% 5.146ms 1.715ms 3 - aten::contiguous 0.09% 6.739us 25.75% 1.957ms 217.483us 0.000us 0.00% 689.503us 76.611us 9 - aten::clone 0.27% 20.691us 25.66% 1.951ms 216.734us 0.000us 0.00% 689.503us 76.611us 9 - aten::copy_ 0.83% 62.851us 24.72% 1.879ms 208.808us 635.680us 10.99% 689.503us 76.611us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.680us 10.99% 635.680us 70.631us 9 - Activity Buffer Request 23.06% 1.753ms 23.06% 1.753ms 1.753ms 53.823us 0.93% 53.823us 53.823us 1 - aten::transpose 0.63% 47.890us 0.86% 65.431us 2.726us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.23% 17.541us 0.23% 17.541us 0.731us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 11.310us 0.67% 50.641us 5.627us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.87% 66.232us 0.87% 66.232us 3.154us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.12% 85.492us 1.12% 85.492us 7.124us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.460us 0.03% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.070us 0.04% 3.070us 1.023us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 68.83% 5.232ms 68.83% 5.232ms 5.232ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.26% 247.835us 31.36% 2.385ms 2.385ms 0.000us 0.00% 5.867ms 5.867ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.821ms 100.14% 5.821ms 5.821ms 1 + aten::scaled_dot_product_attention 0.22% 16.881us 1.81% 137.424us 45.808us 0.000us 0.00% 5.175ms 1.725ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 18.660us 1.59% 120.543us 40.181us 0.000us 0.00% 5.175ms 1.725ms 3 + aten::_efficient_attention_forward 0.35% 26.843us 1.04% 78.951us 26.317us 5.175ms 89.03% 5.175ms 1.725ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.175ms 89.03% 5.175ms 1.725ms 3 + aten::contiguous 0.09% 7.172us 25.72% 1.955ms 217.264us 0.000us 0.00% 691.584us 76.843us 9 + aten::clone 0.31% 23.260us 25.62% 1.948ms 216.467us 0.000us 0.00% 691.584us 76.843us 9 + aten::copy_ 0.84% 64.031us 24.18% 1.839ms 204.318us 637.408us 10.97% 691.584us 76.843us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 637.408us 10.97% 637.408us 70.823us 9 + Activity Buffer Request 22.42% 1.705ms 22.42% 1.705ms 1.705ms 54.176us 0.93% 54.176us 54.176us 1 + aten::transpose 0.64% 49.041us 0.88% 66.991us 2.791us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.24% 17.950us 0.24% 17.950us 0.748us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.17% 12.602us 1.13% 86.083us 9.565us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.29% 98.070us 1.29% 98.070us 4.670us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.22% 92.470us 1.22% 92.470us 7.706us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.04% 2.690us 0.04% 2.690us 0.897us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 2.679us 0.04% 2.679us 0.893us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 68.64% 5.219ms 68.64% 5.219ms 5.219ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.601ms -Self CUDA time total: 5.782ms +Self CPU time total: 7.603ms +Self CUDA time total: 5.812ms @@ -4003,28 +4003,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 2.88% 222.044us 30.17% 2.327ms 2.327ms 0.000us 0.00% 5.986ms 5.986ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 100.13% 5.939ms 5.939ms 1 - aten::scaled_dot_product_attention 0.24% 18.710us 1.85% 142.303us 47.434us 0.000us 0.00% 5.284ms 1.761ms 3 - aten::_scaled_dot_product_efficient_attention 0.25% 19.190us 1.60% 123.593us 41.198us 0.000us 0.00% 5.284ms 1.761ms 3 - aten::_efficient_attention_forward 0.36% 27.947us 1.05% 81.281us 27.094us 5.284ms 89.10% 5.284ms 1.761ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.284ms 89.10% 5.284ms 1.761ms 3 - aten::contiguous 0.09% 7.300us 24.90% 1.920ms 213.350us 0.000us 0.00% 702.238us 78.026us 9 - aten::clone 0.28% 21.930us 24.80% 1.913ms 212.539us 0.000us 0.00% 702.238us 78.026us 9 - aten::copy_ 0.79% 60.872us 23.86% 1.840ms 204.449us 646.526us 10.90% 702.238us 78.026us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 646.526us 10.90% 646.526us 71.836us 9 - Activity Buffer Request 22.23% 1.715ms 22.23% 1.715ms 1.715ms 55.712us 0.94% 55.712us 55.712us 1 - aten::transpose 0.63% 48.814us 0.85% 65.893us 2.746us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 17.079us 0.22% 17.079us 0.712us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 11.801us 0.66% 50.882us 5.654us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.85% 65.644us 0.85% 65.644us 3.126us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.11% 85.622us 1.11% 85.622us 7.135us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.511us 0.03% 2.511us 0.837us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.110us 0.04% 3.110us 1.037us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 69.83% 5.385ms 69.83% 5.385ms 5.385ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.07% 241.867us 30.18% 2.381ms 2.381ms 0.000us 0.00% 6.114ms 6.114ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.067ms 100.14% 6.067ms 6.067ms 1 + aten::scaled_dot_product_attention 0.22% 17.069us 1.75% 137.963us 45.988us 0.000us 0.00% 5.411ms 1.804ms 3 + aten::_scaled_dot_product_efficient_attention 0.24% 18.570us 1.53% 120.894us 40.298us 0.000us 0.00% 5.411ms 1.804ms 3 + aten::_efficient_attention_forward 0.35% 27.663us 1.02% 80.252us 26.751us 5.411ms 89.32% 5.411ms 1.804ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.411ms 89.32% 5.411ms 1.804ms 3 + aten::contiguous 0.11% 8.338us 24.80% 1.957ms 217.397us 0.000us 0.00% 703.296us 78.144us 9 + aten::clone 0.29% 22.493us 24.69% 1.948ms 216.470us 0.000us 0.00% 703.296us 78.144us 9 + aten::copy_ 0.83% 65.242us 23.73% 1.872ms 208.052us 647.296us 10.68% 703.296us 78.144us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 647.296us 10.68% 647.296us 71.922us 9 + Activity Buffer Request 22.06% 1.740ms 22.06% 1.740ms 1.740ms 56.000us 0.92% 56.000us 56.000us 1 + aten::transpose 0.64% 50.792us 0.85% 67.072us 2.795us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 16.280us 0.21% 16.280us 0.678us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.839us 0.68% 53.270us 5.919us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.84% 66.171us 0.84% 66.171us 3.151us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.13% 89.500us 1.13% 89.500us 7.458us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.430us 0.03% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.03% 2.650us 0.03% 2.650us 0.883us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 69.82% 5.508ms 69.82% 5.508ms 5.508ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.713ms -Self CUDA time total: 5.931ms +Self CPU time total: 7.890ms +Self CUDA time total: 6.058ms @@ -4034,28 +4034,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.05% 248.737us 32.15% 2.620ms 2.620ms 0.000us 0.00% 6.167ms 6.167ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.13% 6.117ms 6.117ms 1 - aten::scaled_dot_product_attention 0.24% 19.380us 1.81% 147.173us 49.058us 0.000us 0.00% 5.450ms 1.817ms 3 - aten::_scaled_dot_product_efficient_attention 0.23% 19.059us 1.57% 127.793us 42.598us 0.000us 0.00% 5.450ms 1.817ms 3 - aten::_efficient_attention_forward 0.34% 28.111us 1.04% 84.373us 28.124us 5.450ms 89.21% 5.450ms 1.817ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.450ms 89.21% 5.450ms 1.817ms 3 - aten::contiguous 0.09% 7.070us 26.79% 2.183ms 242.545us 0.000us 0.00% 717.472us 79.719us 9 - aten::clone 0.26% 21.211us 26.70% 2.176ms 241.760us 0.000us 0.00% 717.472us 79.719us 9 - aten::copy_ 0.77% 62.427us 25.76% 2.100ms 233.287us 658.976us 10.79% 717.472us 79.719us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 658.976us 10.79% 658.976us 73.220us 9 - Activity Buffer Request 21.68% 1.767ms 21.68% 1.767ms 1.767ms 58.496us 0.96% 58.496us 58.496us 1 - aten::transpose 0.59% 47.765us 0.81% 65.883us 2.745us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 18.118us 0.22% 18.118us 0.755us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.14% 11.420us 0.68% 55.041us 6.116us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.87% 71.281us 0.87% 71.281us 3.394us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 3.59% 292.889us 3.59% 292.889us 24.407us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.781us 0.03% 2.781us 0.927us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.020us 0.04% 3.020us 1.007us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 67.85% 5.529ms 67.85% 5.529ms 5.529ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.00% 245.113us 31.96% 2.610ms 2.610ms 0.000us 0.00% 6.162ms 6.162ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.113ms 100.14% 6.113ms 6.113ms 1 + aten::scaled_dot_product_attention 0.20% 16.700us 1.71% 139.473us 46.491us 0.000us 0.00% 5.450ms 1.817ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 18.811us 1.50% 122.773us 40.924us 0.000us 0.00% 5.450ms 1.817ms 3 + aten::_efficient_attention_forward 0.34% 27.691us 0.98% 80.171us 26.724us 5.450ms 89.27% 5.450ms 1.817ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.450ms 89.27% 5.450ms 1.817ms 3 + aten::contiguous 0.09% 7.732us 26.74% 2.184ms 242.673us 0.000us 0.00% 712.645us 79.183us 9 + aten::clone 0.28% 22.711us 26.65% 2.176ms 241.814us 0.000us 0.00% 712.645us 79.183us 9 + aten::copy_ 0.78% 63.988us 25.72% 2.101ms 233.430us 654.820us 10.73% 712.645us 79.183us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.820us 10.73% 654.820us 72.758us 9 + Activity Buffer Request 21.86% 1.785ms 21.86% 1.785ms 1.785ms 57.825us 0.95% 57.825us 57.825us 1 + aten::transpose 0.59% 47.982us 0.80% 65.243us 2.718us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 17.261us 0.21% 17.261us 0.719us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 11.742us 0.65% 52.742us 5.860us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.82% 66.990us 0.82% 66.990us 3.190us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.34% 272.558us 3.34% 272.558us 22.713us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.519us 0.03% 2.519us 0.840us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.03% 2.830us 0.03% 2.830us 0.943us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 68.04% 5.557ms 68.04% 5.557ms 5.557ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 8.150ms -Self CUDA time total: 6.109ms +Self CPU time total: 8.167ms +Self CUDA time total: 6.105ms @@ -4065,28 +4065,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 2.74% 222.904us 29.02% 2.363ms 2.363ms 0.000us 0.00% 6.392ms 6.392ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.341ms 100.13% 6.341ms 6.341ms 1 - aten::scaled_dot_product_attention 0.23% 18.463us 1.76% 143.054us 47.685us 0.000us 0.00% 5.664ms 1.888ms 3 - aten::_scaled_dot_product_efficient_attention 0.23% 18.699us 1.53% 124.591us 41.530us 0.000us 0.00% 5.664ms 1.888ms 3 - aten::_efficient_attention_forward 0.35% 28.650us 1.01% 82.071us 27.357us 5.664ms 89.43% 5.664ms 1.888ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664ms 89.43% 5.664ms 1.888ms 3 - aten::contiguous 0.09% 7.480us 24.00% 1.954ms 217.122us 0.000us 0.00% 727.838us 80.871us 9 - aten::clone 0.26% 21.231us 23.90% 1.947ms 216.290us 0.000us 0.00% 727.838us 80.871us 9 - aten::copy_ 0.78% 63.523us 23.01% 1.874ms 208.176us 669.182us 10.57% 727.838us 80.871us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 669.182us 10.57% 669.182us 74.354us 9 - Activity Buffer Request 19.19% 1.562ms 19.19% 1.562ms 1.562ms 58.656us 0.93% 58.656us 58.656us 1 - aten::transpose 0.60% 48.754us 0.82% 66.672us 2.778us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 17.918us 0.22% 17.918us 0.747us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.14% 11.269us 0.64% 51.800us 5.756us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.81% 66.291us 0.81% 66.291us 3.157us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 3.31% 269.756us 3.31% 269.756us 22.480us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.590us 0.03% 2.590us 0.863us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 2.940us 0.04% 2.940us 0.980us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 70.98% 5.781ms 70.98% 5.781ms 5.781ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 2.93% 244.444us 30.49% 2.544ms 2.544ms 0.000us 0.00% 6.411ms 6.411ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.361ms 100.14% 6.361ms 6.361ms 1 + aten::scaled_dot_product_attention 0.20% 16.791us 1.67% 139.273us 46.424us 0.000us 0.00% 5.684ms 1.895ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 19.350us 1.47% 122.482us 40.827us 0.000us 0.00% 5.684ms 1.895ms 3 + aten::_efficient_attention_forward 0.32% 26.939us 0.96% 79.712us 26.571us 5.684ms 89.48% 5.684ms 1.895ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.684ms 89.48% 5.684ms 1.895ms 3 + aten::contiguous 0.10% 8.370us 25.37% 2.117ms 235.225us 0.000us 0.00% 726.946us 80.772us 9 + aten::clone 0.27% 22.301us 25.27% 2.109ms 234.295us 0.000us 0.00% 726.946us 80.772us 9 + aten::copy_ 0.79% 65.502us 24.38% 2.034ms 226.048us 668.514us 10.52% 726.946us 80.772us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.514us 10.52% 668.514us 74.279us 9 + Activity Buffer Request 20.48% 1.709ms 20.48% 1.709ms 1.709ms 58.432us 0.92% 58.432us 58.432us 1 + aten::transpose 0.59% 49.601us 0.80% 67.072us 2.795us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 17.471us 0.21% 17.471us 0.728us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 11.518us 0.62% 51.920us 5.769us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.81% 67.173us 0.81% 67.173us 3.199us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.36% 280.595us 3.36% 280.595us 23.383us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.391us 0.03% 2.391us 0.797us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.03% 2.751us 0.03% 2.751us 0.917us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 69.51% 5.799ms 69.51% 5.799ms 5.799ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 8.144ms -Self CUDA time total: 6.333ms +Self CPU time total: 8.344ms +Self CUDA time total: 6.353ms @@ -4096,44 +4096,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 2.91% 254.056us 31.19% 2.722ms 2.722ms 0.000us 0.00% 6.645ms 6.645ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.592ms 100.12% 6.592ms 6.592ms 1 - aten::scaled_dot_product_attention 0.23% 20.440us 1.69% 147.533us 49.178us 0.000us 0.00% 5.910ms 1.970ms 3 - aten::_scaled_dot_product_efficient_attention 0.22% 19.250us 1.46% 127.093us 42.364us 0.000us 0.00% 5.910ms 1.970ms 3 - aten::_efficient_attention_forward 0.33% 28.899us 0.98% 85.242us 28.414us 5.910ms 89.76% 5.910ms 1.970ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.910ms 89.76% 5.910ms 1.970ms 3 - aten::contiguous 0.08% 7.268us 26.04% 2.272ms 252.404us 0.000us 0.00% 734.815us 81.646us 9 - aten::clone 0.28% 24.054us 25.95% 2.264ms 251.596us 0.000us 0.00% 734.815us 81.646us 9 - aten::copy_ 0.77% 66.891us 25.04% 2.185ms 242.745us 674.239us 10.24% 734.815us 81.646us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 674.239us 10.24% 674.239us 74.915us 9 - Activity Buffer Request 20.22% 1.764ms 20.22% 1.764ms 1.764ms 60.576us 0.92% 60.576us 60.576us 1 - aten::transpose 0.62% 53.860us 0.81% 70.972us 2.957us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.20% 17.112us 0.20% 17.112us 0.713us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 12.910us 0.64% 55.601us 6.178us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.82% 71.503us 0.82% 71.503us 3.405us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 4.30% 375.338us 4.30% 375.338us 31.278us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.571us 0.03% 2.571us 0.857us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.03% 3.000us 0.03% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 68.81% 6.003ms 68.81% 6.003ms 6.003ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 2.83% 247.966us 30.03% 2.630ms 2.630ms 0.000us 0.00% 6.745ms 6.745ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.693ms 100.13% 6.693ms 6.693ms 1 + aten::scaled_dot_product_attention 0.19% 17.071us 1.57% 137.393us 45.798us 0.000us 0.00% 6.009ms 2.003ms 3 + aten::_scaled_dot_product_efficient_attention 0.21% 18.029us 1.37% 120.322us 40.107us 0.000us 0.00% 6.009ms 2.003ms 3 + aten::_efficient_attention_forward 0.30% 26.699us 0.92% 80.822us 26.941us 6.009ms 89.89% 6.009ms 2.003ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.009ms 89.89% 6.009ms 2.003ms 3 + aten::contiguous 0.09% 8.060us 25.13% 2.201ms 244.542us 0.000us 0.00% 736.293us 81.810us 9 + aten::clone 0.25% 21.768us 25.04% 2.193ms 243.646us 0.000us 0.00% 736.293us 81.810us 9 + aten::copy_ 0.76% 66.873us 24.16% 2.115ms 235.039us 675.652us 10.11% 736.293us 81.810us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 675.652us 10.11% 675.652us 75.072us 9 + Activity Buffer Request 20.46% 1.792ms 20.46% 1.792ms 1.792ms 60.641us 0.91% 60.641us 60.641us 1 + aten::transpose 0.56% 48.641us 0.74% 65.181us 2.716us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.19% 16.540us 0.19% 16.540us 0.689us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 12.261us 0.64% 55.702us 6.189us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.78% 68.633us 0.78% 68.633us 3.268us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.20% 280.067us 3.20% 280.067us 23.339us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.620us 0.03% 2.620us 0.873us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.03% 2.860us 0.03% 2.860us 0.953us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 69.97% 6.127ms 69.97% 6.127ms 6.127ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 8.725ms -Self CUDA time total: 6.584ms +Self CPU time total: 8.757ms +Self CUDA time total: 6.684ms impl wl p50(ms) ok -torch_mem_eff cuda_attn_L128_bfloat16 1.83 True -torch_mem_eff cuda_attn_L256_bfloat16 1.93 True -torch_mem_eff cuda_attn_L320_bfloat16 1.95 True -torch_mem_eff cuda_attn_L384_bfloat16 2.04 True -torch_mem_eff cuda_attn_L448_bfloat16 2.08 True -torch_mem_eff cuda_attn_L512_bfloat16 2.17 True +torch_mem_eff cuda_attn_L128_bfloat16 1.86 True +torch_mem_eff cuda_attn_L256_bfloat16 1.92 True +torch_mem_eff cuda_attn_L320_bfloat16 2.02 True +torch_mem_eff cuda_attn_L384_bfloat16 1.99 True +torch_mem_eff cuda_attn_L448_bfloat16 2.10 True +torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
-
-
▶ UV Install Logs
- -

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html index 733dcf04d2a9ac16dbc04893917abf21bba0c97d..d73a02780a0d62d77b290c0b1384fb4908660003 100644 --- a/flash_attn/impls/sage_attention.html +++ b/flash_attn/impls/sage_attention.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: benchmark | 4.72s +Cell: benchmark | 4.95s | Raw @@ -3938,24 +3938,22 @@ Cell: benchmark | 4.72s
Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd'
+  Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd'
+  Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd'
+  Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd'
+  Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd'
+  Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd'
+  Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd'
 
-Fetching 8 files: 0%| | 0/8 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 8 files: 12%|█▎ | 1/8 [00:00<00:00, 7.67it/s] -Fetching 8 files: 38%|███▊ | 3/8 [00:00<00:01, 3.86it/s] -Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 10.82it/s] +Fetching 8 files: 0%| | 0/8 [00:00<?, ?it/s] +Fetching 8 files: 38%|███▊ | 3/8 [00:00<00:01, 3.95it/s] +Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 10.53it/s]

Artifacts:

diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html index 546a7841d190ab8839b211aec0bb46dad7681ca7..4fd2d9c97dddc2c5eb3a85e3fc00916923b99a94 100644 --- a/flash_attn/impls/xformers.html +++ b/flash_attn/impls/xformers.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: benchmark | 5.49s +Cell: benchmark | 5.67s | Raw @@ -3940,21 +3940,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 9.60% 449.460us 54.45% 2.550ms 2.550ms 0.000us 0.00% 3.540ms 3.540ms 1 - xformers_flash3::flash_fwd 4.00% 187.356us 44.14% 2.067ms 689.137us 0.000us 0.00% 3.540ms 1.180ms 3 - flash_attn_3::fwd 1.48% 69.234us 40.14% 1.880ms 626.685us 2.646ms 100.00% 3.540ms 1.180ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.648ms 100.06% 2.648ms 2.648ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.646ms 100.00% 2.646ms 882.010us 3 - Activity Buffer Request 36.74% 1.721ms 36.74% 1.721ms 1.721ms 894.309us 33.80% 894.309us 894.309us 1 - aten::empty 0.73% 34.410us 0.73% 34.410us 5.735us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.25% 11.780us 0.25% 11.780us 3.927us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.93% 43.670us 0.93% 43.670us 14.557us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.24% 11.301us 0.72% 33.571us 5.595us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.48% 22.270us 0.48% 22.270us 3.712us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 45.55% 2.133ms 45.55% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 9.78% 468.612us 53.77% 2.576ms 2.576ms 0.000us 0.00% 3.664ms 3.664ms 1 + xformers_flash3::flash_fwd 4.05% 193.923us 43.19% 2.069ms 689.708us 0.000us 0.00% 3.664ms 1.221ms 3 + flash_attn_3::fwd 1.52% 72.582us 39.15% 1.875ms 625.067us 2.752ms 100.00% 3.664ms 1.221ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.752ms 100.00% 2.752ms 917.464us 3 + Activity Buffer Request 35.57% 1.704ms 35.57% 1.704ms 1.704ms 911.394us 33.11% 911.394us 911.394us 1 + aten::empty 0.91% 43.821us 0.91% 43.821us 7.304us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.25% 12.121us 0.25% 12.121us 4.040us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.89% 42.701us 0.89% 42.701us 14.234us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.31% 15.029us 0.79% 38.050us 6.342us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.48% 23.021us 0.48% 23.021us 3.837us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 46.23% 2.215ms 46.23% 2.215ms 2.215ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.684ms -Self CUDA time total: 2.646ms +Self CPU time total: 4.790ms +Self CUDA time total: 2.752ms @@ -3964,21 +3964,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.53% 314.780us 50.80% 2.448ms 2.448ms 0.000us 0.00% 3.745ms 3.745ms 1 - xformers_flash3::flash_fwd 2.99% 144.051us 43.78% 2.110ms 703.226us 0.000us 0.00% 3.745ms 1.248ms 3 - flash_attn_3::fwd 1.06% 51.161us 40.79% 1.966ms 655.209us 2.793ms 100.00% 3.745ms 1.248ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.06% 2.795ms 2.795ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.793ms 100.00% 2.793ms 931.037us 3 - Activity Buffer Request 38.27% 1.844ms 38.27% 1.844ms 1.844ms 952.158us 34.09% 952.158us 952.158us 1 - aten::empty 0.59% 28.641us 0.59% 28.641us 4.774us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.380us 0.11% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.75% 36.051us 0.75% 36.051us 12.017us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.19% 9.170us 0.49% 23.510us 3.918us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.30% 14.340us 0.30% 14.340us 2.390us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 49.20% 2.371ms 49.20% 2.371ms 2.371ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.55% 315.485us 49.52% 2.386ms 2.386ms 0.000us 0.00% 3.791ms 3.791ms 1 + xformers_flash3::flash_fwd 2.94% 141.873us 42.50% 2.048ms 682.535us 0.000us 0.00% 3.791ms 1.264ms 3 + flash_attn_3::fwd 1.10% 52.803us 39.56% 1.906ms 635.244us 2.857ms 100.00% 3.791ms 1.264ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.05% 2.858ms 2.858ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 100.00% 2.857ms 952.327us 3 + Activity Buffer Request 37.05% 1.785ms 37.05% 1.785ms 1.785ms 933.660us 32.68% 933.660us 933.660us 1 + aten::empty 0.60% 29.019us 0.60% 29.019us 4.837us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.710us 0.12% 5.710us 1.903us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.69% 33.350us 0.69% 33.350us 11.117us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 8.801us 0.47% 22.752us 3.792us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.29% 13.951us 0.29% 13.951us 2.325us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 50.48% 2.432ms 50.48% 2.432ms 2.432ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.819ms -Self CUDA time total: 2.793ms +Self CPU time total: 4.818ms +Self CUDA time total: 2.857ms @@ -3988,21 +3988,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.41% 306.378us 48.23% 2.306ms 2.306ms 0.000us 0.00% 3.879ms 3.879ms 1 - xformers_flash3::flash_fwd 2.97% 141.954us 41.36% 1.977ms 659.046us 0.000us 0.00% 3.879ms 1.293ms 3 - flash_attn_3::fwd 1.09% 51.910us 38.39% 1.835ms 611.728us 2.892ms 100.00% 3.879ms 1.293ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.06% 2.893ms 2.893ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.882us 3 - Activity Buffer Request 35.83% 1.713ms 35.83% 1.713ms 1.713ms 986.975us 34.13% 986.975us 986.975us 1 - aten::empty 0.60% 28.840us 0.60% 28.840us 4.807us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.330us 0.11% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.75% 36.082us 0.75% 36.082us 12.027us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.17% 8.059us 0.47% 22.400us 3.733us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.30% 14.341us 0.30% 14.341us 2.390us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 51.77% 2.475ms 51.77% 2.475ms 2.475ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.44% 303.576us 47.74% 2.252ms 2.252ms 0.000us 0.00% 3.845ms 3.845ms 1 + xformers_flash3::flash_fwd 3.02% 142.344us 40.83% 1.926ms 641.984us 0.000us 0.00% 3.845ms 1.282ms 3 + flash_attn_3::fwd 1.11% 52.511us 37.81% 1.784ms 594.536us 2.878ms 100.00% 3.845ms 1.282ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.880ms 100.05% 2.880ms 2.880ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.878ms 100.00% 2.878ms 959.487us 3 + Activity Buffer Request 35.25% 1.663ms 35.25% 1.663ms 1.663ms 967.007us 33.59% 967.007us 967.007us 1 + aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.320us 0.11% 5.320us 1.773us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.72% 33.781us 0.72% 33.781us 11.260us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 8.350us 0.47% 21.990us 3.665us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.29% 13.640us 0.29% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 52.26% 2.465ms 52.26% 2.465ms 2.465ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.781ms -Self CUDA time total: 2.892ms +Self CPU time total: 4.717ms +Self CUDA time total: 2.878ms @@ -4012,21 +4012,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.14% 305.279us 50.00% 2.487ms 2.487ms 0.000us 0.00% 3.889ms 3.889ms 1 - xformers_flash3::flash_fwd 2.94% 146.052us 43.42% 2.159ms 719.674us 0.000us 0.00% 3.889ms 1.296ms 3 - flash_attn_3::fwd 1.05% 52.012us 40.48% 2.013ms 670.990us 2.906ms 100.00% 3.889ms 1.296ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.907ms 100.06% 2.907ms 2.907ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.605us 3 - Activity Buffer Request 34.76% 1.728ms 34.76% 1.728ms 1.728ms 983.453us 33.84% 983.453us 983.453us 1 - aten::empty 0.63% 31.322us 0.63% 31.322us 5.220us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.389us 0.11% 5.389us 1.796us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.94% 195.844us 3.94% 195.844us 65.281us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.17% 8.560us 0.45% 22.331us 3.722us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.28% 13.771us 0.28% 13.771us 2.295us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 50.00% 2.486ms 50.00% 2.486ms 2.486ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.01% 303.306us 50.06% 2.525ms 2.525ms 0.000us 0.00% 3.923ms 3.923ms 1 + xformers_flash3::flash_fwd 2.90% 146.364us 43.59% 2.199ms 733.113us 0.000us 0.00% 3.923ms 1.308ms 3 + flash_attn_3::fwd 1.02% 51.431us 40.69% 2.053ms 684.325us 2.938ms 100.00% 3.923ms 1.308ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.195us 3 + Activity Buffer Request 34.86% 1.758ms 34.86% 1.758ms 1.758ms 985.691us 33.55% 985.691us 985.691us 1 + aten::empty 0.57% 28.860us 0.57% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.561us 0.11% 5.561us 1.854us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.14% 208.674us 4.14% 208.674us 69.558us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 9.230us 0.45% 22.800us 3.800us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.27% 13.570us 0.27% 13.570us 2.262us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 49.94% 2.520ms 49.94% 2.520ms 2.520ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.973ms -Self CUDA time total: 2.906ms +Self CPU time total: 5.045ms +Self CUDA time total: 2.938ms @@ -4036,21 +4036,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 5.54% 306.968us 45.05% 2.496ms 2.496ms 0.000us 0.00% 4.618ms 4.618ms 1 - xformers_flash3::flash_fwd 2.62% 145.024us 39.11% 2.167ms 722.434us 0.000us 0.00% 4.618ms 1.539ms 3 - flash_attn_3::fwd 0.92% 51.181us 36.50% 2.022ms 674.093us 3.463ms 100.00% 4.618ms 1.539ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.05% 3.465ms 3.465ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3 - Activity Buffer Request 31.42% 1.741ms 31.42% 1.741ms 1.741ms 1.155ms 33.34% 1.155ms 1.155ms 1 - aten::empty 0.54% 29.990us 0.54% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.10% 5.350us 0.10% 5.350us 1.783us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.51% 194.715us 3.51% 194.715us 64.905us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.15% 8.420us 0.40% 22.040us 3.673us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.25% 13.620us 0.25% 13.620us 2.270us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 54.95% 3.045ms 54.95% 3.045ms 3.045ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.53% 307.446us 44.37% 2.468ms 2.468ms 0.000us 0.00% 4.694ms 4.694ms 1 + xformers_flash3::flash_fwd 2.65% 147.575us 38.45% 2.139ms 712.966us 0.000us 0.00% 4.694ms 1.565ms 3 + flash_attn_3::fwd 0.89% 49.519us 35.79% 1.991ms 663.774us 3.515ms 100.00% 4.694ms 1.565ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.517ms 100.05% 3.517ms 3.517ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.515ms 100.00% 3.515ms 1.172ms 3 + Activity Buffer Request 30.66% 1.706ms 30.66% 1.706ms 1.706ms 1.179ms 33.55% 1.179ms 1.179ms 1 + aten::empty 0.52% 28.861us 0.52% 28.861us 4.810us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 6.000us 0.11% 6.000us 2.000us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.61% 201.015us 3.61% 201.015us 67.005us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.15% 8.290us 0.39% 21.930us 3.655us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.25% 13.640us 0.25% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 55.63% 3.095ms 55.63% 3.095ms 3.095ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.541ms -Self CUDA time total: 3.463ms +Self CPU time total: 5.563ms +Self CUDA time total: 3.515ms @@ -4060,37 +4060,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 5.16% 304.966us 48.93% 2.893ms 2.893ms 0.000us 0.00% 4.598ms 4.598ms 1 - xformers_flash3::flash_fwd 9.37% 553.844us 43.37% 2.564ms 854.584us 0.000us 0.00% 4.598ms 1.533ms 3 - flash_attn_3::fwd 0.88% 52.300us 34.00% 2.010ms 669.970us 3.443ms 100.00% 4.598ms 1.533ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.445ms 100.05% 3.445ms 3.445ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.443ms 100.00% 3.443ms 1.148ms 3 - Activity Buffer Request 28.71% 1.697ms 28.71% 1.697ms 1.697ms 1.155ms 33.53% 1.155ms 1.155ms 1 - aten::empty 0.52% 30.653us 0.52% 30.653us 5.109us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.09% 5.400us 0.09% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.80% 224.365us 3.80% 224.365us 74.788us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.15% 8.918us 0.40% 23.921us 3.987us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.25% 15.003us 0.25% 15.003us 2.501us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 51.07% 3.019ms 51.07% 3.019ms 3.019ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.46% 305.147us 45.13% 2.521ms 2.521ms 0.000us 0.00% 4.658ms 4.658ms 1 + xformers_flash3::flash_fwd 2.65% 147.824us 39.28% 2.194ms 731.306us 0.000us 0.00% 4.658ms 1.553ms 3 + flash_attn_3::fwd 0.94% 52.350us 36.63% 2.046ms 682.031us 3.488ms 100.00% 4.658ms 1.553ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.489ms 100.05% 3.489ms 3.489ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.488ms 100.00% 3.488ms 1.163ms 3 + Activity Buffer Request 31.45% 1.757ms 31.45% 1.757ms 1.757ms 1.171ms 33.57% 1.171ms 1.171ms 1 + aten::empty 0.54% 29.960us 0.54% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.370us 0.10% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.61% 201.885us 3.61% 201.885us 67.295us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.15% 8.170us 0.39% 21.900us 3.650us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.25% 13.730us 0.25% 13.730us 2.288us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 54.87% 3.065ms 54.87% 3.065ms 3.065ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.912ms -Self CUDA time total: 3.443ms +Self CPU time total: 5.586ms +Self CUDA time total: 3.488ms impl wl p50(ms) ok -xformers_meff cuda_attn_L128_bfloat16 0.98 True +xformers_meff cuda_attn_L128_bfloat16 0.99 True xformers_meff cuda_attn_L256_bfloat16 1.04 True -xformers_meff cuda_attn_L320_bfloat16 1.06 True -xformers_meff cuda_attn_L384_bfloat16 1.09 True +xformers_meff cuda_attn_L320_bfloat16 1.07 True +xformers_meff cuda_attn_L384_bfloat16 1.08 True xformers_meff cuda_attn_L448_bfloat16 1.26 True -xformers_meff cuda_attn_L512_bfloat16 1.24 True +xformers_meff cuda_attn_L512_bfloat16 1.25 True
▶ UV Install Logs
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg index 8800d95ccf9593243148bb665ba5de1acc0ab0bb..fb5eb60d205faf06eb83825655f2fdb7f3acdcc6 100644 --- a/flash_attn/results/artifacts/combine/latency.svg +++ b/flash_attn/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f160f3f11d41b3a388cb9ab3a3ed23dc9ca473cb8531e7d3dc53c94cc97ebd0 -size 24778 +oid sha256:4c6091c233ea3ade4488d4a47b74639c4507900bcc055a5d5ec3d4f9d3262f2b +size 24785 diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html index d64bdc9d89db4906408cc054a8f0e113b97f1a4b..b2173dcc03da5f1b475a08b7d7390cace1929cf5 100644 --- a/flash_attn/results/combined_results.html +++ b/flash_attn/results/combined_results.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { - 2025-12-19T19:55:48.469348 + 2025-12-19T23:02:45.375383 image/svg+xml @@ -3999,96 +3999,96 @@ body[data-tool="eraser"] .main-content { - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4096,73 +4096,73 @@ body[data-tool="eraser"] .main-content { - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - + + + + + @@ -4247,7 +4247,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.68s +Cell: combine | 4.45s | Raw @@ -4356,48 +4356,48 @@ Summary: 6 found, 0 skipped, 0 missing COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True -hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True -hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True -hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.24 True -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True +hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True -hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True +hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.22 True sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False - Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd' + Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False - Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd' + Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False - Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd' + Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False - Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd' + Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False - Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd' + Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False - Error: module 'sage_attention_d202bc414c936d8' has no attribute 'fwd' + Error: module 'sage_attention_b91c5fb7ee1dcfba' has no attribute 'fwd' torch_flash_ma cuda_attn_L128_bfloat16 1.21 True -torch_flash_ma cuda_attn_L256_bfloat16 1.25 True -torch_flash_ma cuda_attn_L320_bfloat16 1.28 True -torch_flash_ma cuda_attn_L384_bfloat16 1.31 True -torch_flash_ma cuda_attn_L448_bfloat16 1.45 True -torch_flash_ma cuda_attn_L512_bfloat16 1.49 True -torch_mem_eff cuda_attn_L128_bfloat16 1.83 True -torch_mem_eff cuda_attn_L256_bfloat16 1.93 True -torch_mem_eff cuda_attn_L320_bfloat16 1.95 True -torch_mem_eff cuda_attn_L384_bfloat16 2.04 True -torch_mem_eff cuda_attn_L448_bfloat16 2.08 True -torch_mem_eff cuda_attn_L512_bfloat16 2.17 True -xformers_meff cuda_attn_L128_bfloat16 0.98 True +torch_flash_ma cuda_attn_L256_bfloat16 1.28 True +torch_flash_ma cuda_attn_L320_bfloat16 1.30 True +torch_flash_ma cuda_attn_L384_bfloat16 1.33 True +torch_flash_ma cuda_attn_L448_bfloat16 1.49 True +torch_flash_ma cuda_attn_L512_bfloat16 1.52 True +torch_mem_eff cuda_attn_L128_bfloat16 1.86 True +torch_mem_eff cuda_attn_L256_bfloat16 1.92 True +torch_mem_eff cuda_attn_L320_bfloat16 2.02 True +torch_mem_eff cuda_attn_L384_bfloat16 1.99 True +torch_mem_eff cuda_attn_L448_bfloat16 2.10 True +torch_mem_eff cuda_attn_L512_bfloat16 2.25 True +xformers_meff cuda_attn_L128_bfloat16 0.99 True xformers_meff cuda_attn_L256_bfloat16 1.04 True -xformers_meff cuda_attn_L320_bfloat16 1.06 True -xformers_meff cuda_attn_L384_bfloat16 1.09 True +xformers_meff cuda_attn_L320_bfloat16 1.07 True +xformers_meff cuda_attn_L384_bfloat16 1.08 True xformers_meff cuda_attn_L448_bfloat16 1.26 True -xformers_meff cuda_attn_L512_bfloat16 1.24 True +xformers_meff cuda_attn_L512_bfloat16 1.25 True GENERATING COMBINED VISUALIZATION @@ -4421,7 +4421,7 @@ Implementations included:
▶ UV Install Logs
@@ -4434,7 +4434,7 @@ Installed 37 packages in 206ms - 2025-12-19T19:55:48.469348 + 2025-12-19T23:02:45.375383 image/svg+xml @@ -4544,96 +4544,96 @@ Installed 37 packages in 206ms - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4641,73 +4641,73 @@ Installed 37 packages in 206ms - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - + + + + + diff --git a/index.html b/index.html index 4a9bcb16be660be60631823778cbdfacfa53438f..85b1c70826a1dbc66120a1230261edc2bb6d7d31 100644 --- a/index.html +++ b/index.html @@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35 + Darwin arm64 | macOS-15.7.2-arm64-arm-64bit
diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl index cc2fd4ec90b5213ced430fa69e33f68723e0513f..7f7c840fb736fe5b9f0ff904d2d0452ff8ed9711 100644 --- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl @@ -1,4 +1,4 @@ -{"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8300210000129482, "p50": 0.8342720000200643, "p90": 0.83692099997279, "mean": 0.8337814000014987, "iqr": 0.006369000004724512, "raw_times": [0.8300210000129482, 0.83692099997279, 0.8342720000200643, 0.8305519999680655, 0.8371410000336255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8391320000100677, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6468019999820172, "p50": 1.6505419999930382, "p90": 1.6509619999851566, "mean": 1.650563999987753, "iqr": 0.0014700000292577897, "raw_times": [1.6509619999851566, 1.6505419999930382, 1.6468019999820172, 1.6494919999558988, 1.6550220000226545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6566229999739335, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.643002000037086, "p50": 1.6493729999638163, "p90": 1.6502219999665613, "mean": 1.6475743999876613, "iqr": 0.0068199999532225775, "raw_times": [1.643002000037086, 1.6434020000133387, 1.6502219999665613, 1.6493729999638163, 1.6518729999575044], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6485120000311326, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2460340000284305, "p50": 3.2577039999637236, "p90": 3.260522999994464, "mean": 3.2551376000014898, "iqr": 0.011509999978898122, "raw_times": [3.260522999994464, 3.2460340000284305, 3.249013000015566, 3.2577039999637236, 3.262414000005265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2401029999959974, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8274980000351206, "p50": 0.8322979999775271, "p90": 0.8378580000680813, "mean": 0.8332618000167713, "iqr": 0.0071710001066094264, "raw_times": [0.8322979999775271, 0.8378580000680813, 0.8379680000416556, 0.8306869999614719, 0.8274980000351206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8445380001376179, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6395549998833303, "p50": 1.6463560000374855, "p90": 1.6514159999587719, "mean": 1.6487175999827741, "iqr": 0.00707099979990744, "raw_times": [1.6395549998833303, 1.6514159999587719, 1.6463560000374855, 1.6443450001588644, 1.6619159998754185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6726759999983187, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6412159998253628, "p50": 1.644736000116609, "p90": 1.6461760001220682, "mean": 1.6448379999474128, "iqr": 0.0036900003124173963, "raw_times": [1.644736000116609, 1.6412159998253628, 1.649575999863373, 1.6461760001220682, 1.6424859998096508], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.646575999984634, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-12-19T23:02:17Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2493999999587686, "p50": 3.2569499999226537, "p90": 3.2582300000285613, "mean": 3.2570102000136103, "iqr": 0.006920000032550888, "raw_times": [3.2493999999587686, 3.2691610001620575, 3.2513099999960104, 3.2569499999226537, 3.2582300000285613], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2572910001817945, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null} diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html index e6a699d4a854162c9c9c93c67f1bbe2f21fe6eb5..cea85be31c9b73bfab4875ba24e3ca0dcdf69b2c 100644 --- a/layer_norm/impls/hf_kernels_layer_norm.html +++ b/layer_norm/impls/hf_kernels_layer_norm.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: benchmark | 6.61s +Cell: benchmark | 6.38s | Raw @@ -3961,19 +3961,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 4.34% 182.243us 49.12% 2.065ms 2.065ms 0.000us 0.00% 3.103ms 3.103ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.54% 64.542us 44.23% 1.860ms 619.846us 2.366ms 100.00% 3.103ms 1.034ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.367ms 100.06% 2.367ms 2.367ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.366ms 100.00% 2.366ms 788.551us 3 - Activity Buffer Request 40.34% 1.696ms 40.34% 1.696ms 1.696ms 737.372us 31.17% 737.372us 737.372us 1 - aten::view 0.55% 23.192us 0.55% 23.192us 3.865us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.11% 46.641us 1.11% 46.641us 5.182us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.21% 8.950us 0.21% 8.950us 2.983us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.04% 43.741us 1.04% 43.741us 14.580us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 50.88% 2.139ms 50.88% 2.139ms 2.139ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 4.50% 190.523us 50.02% 2.118ms 2.118ms 0.000us 0.00% 3.104ms 3.104ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 1.66% 70.302us 44.96% 1.904ms 634.711us 2.362ms 100.00% 3.104ms 1.035ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.364ms 100.07% 2.364ms 2.364ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.00% 2.362ms 787.316us 3 + Activity Buffer Request 40.99% 1.736ms 40.99% 1.736ms 1.736ms 741.567us 31.40% 741.567us 741.567us 1 + aten::view 0.56% 23.541us 0.56% 23.541us 3.923us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.07% 45.480us 1.07% 45.480us 5.053us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 10.011us 0.24% 10.011us 3.337us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.01% 42.571us 1.01% 42.571us 14.190us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 49.98% 2.117ms 49.98% 2.117ms 2.117ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.204ms -Self CUDA time total: 2.366ms +Self CPU time total: 4.235ms +Self CUDA time total: 2.362ms @@ -3983,19 +3983,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 2.14% 142.004us 28.99% 1.924ms 1.924ms 0.000us 0.00% 6.477ms 6.477ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.66% 43.639us 26.68% 1.771ms 590.278us 4.886ms 100.00% 6.477ms 2.159ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.887ms 100.03% 4.887ms 4.887ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.886ms 100.00% 4.886ms 1.629ms 3 - Activity Buffer Request 25.01% 1.660ms 25.01% 1.660ms 1.660ms 1.591ms 32.57% 1.591ms 1.591ms 1 - aten::view 0.17% 11.341us 0.17% 11.341us 1.890us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.47% 31.442us 0.47% 31.442us 3.494us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.640us 0.07% 4.640us 1.547us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.46% 30.730us 0.46% 30.730us 10.243us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 71.01% 4.714ms 71.01% 4.714ms 4.714ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 2.21% 144.492us 28.97% 1.894ms 1.894ms 0.000us 0.00% 6.395ms 6.395ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.222us 26.58% 1.738ms 579.353us 4.814ms 100.00% 6.395ms 2.132ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.816ms 100.03% 4.816ms 4.816ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.814ms 100.00% 4.814ms 1.605ms 3 + Activity Buffer Request 24.92% 1.629ms 24.92% 1.629ms 1.629ms 1.581ms 32.84% 1.581ms 1.581ms 1 + aten::view 0.18% 11.541us 0.18% 11.541us 1.923us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.45% 29.440us 0.45% 29.440us 3.271us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.060us 0.08% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.45% 29.150us 0.45% 29.150us 9.717us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.03% 4.644ms 71.03% 4.644ms 4.644ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.638ms -Self CUDA time total: 4.886ms +Self CPU time total: 6.538ms +Self CUDA time total: 4.814ms @@ -4005,19 +4005,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.93% 128.176us 30.23% 2.007ms 2.007ms 0.000us 0.00% 6.371ms 6.371ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.67% 44.789us 28.12% 1.867ms 622.462us 4.799ms 100.00% 6.371ms 2.124ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.800ms 100.03% 4.800ms 4.800ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.799ms 100.00% 4.799ms 1.600ms 3 - Activity Buffer Request 26.44% 1.756ms 26.44% 1.756ms 1.756ms 1.572ms 32.76% 1.572ms 1.572ms 1 - aten::view 0.18% 11.888us 0.18% 11.888us 1.981us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.47% 31.493us 0.47% 31.493us 3.499us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.790us 0.07% 4.790us 1.597us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.46% 30.490us 0.46% 30.490us 10.163us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 69.77% 4.633ms 69.77% 4.633ms 4.633ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 2.04% 135.241us 30.10% 1.992ms 1.992ms 0.000us 0.00% 6.361ms 6.361ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.68% 45.331us 27.89% 1.846ms 615.254us 4.793ms 100.00% 6.361ms 2.120ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.794ms 100.03% 4.794ms 4.794ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.793ms 100.00% 4.793ms 1.598ms 3 + Activity Buffer Request 26.25% 1.737ms 26.25% 1.737ms 1.737ms 1.569ms 32.73% 1.569ms 1.569ms 1 + aten::view 0.17% 11.061us 0.17% 11.061us 1.844us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.44% 29.151us 0.44% 29.151us 3.239us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.09% 5.831us 0.09% 5.831us 1.944us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.43% 28.320us 0.43% 28.320us 9.440us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 69.90% 4.626ms 69.90% 4.626ms 4.626ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.641ms -Self CUDA time total: 4.799ms +Self CPU time total: 6.618ms +Self CUDA time total: 4.793ms @@ -4027,40 +4027,37 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.63% 190.425us 19.77% 2.315ms 2.315ms 0.000us 0.00% 12.766ms 12.766ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.56% 65.132us 17.99% 2.107ms 702.188us 9.610ms 100.00% 12.766ms 4.255ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.611ms 100.01% 9.611ms 9.611ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.610ms 100.00% 9.610ms 3.203ms 3 - Activity Buffer Request 14.43% 1.690ms 14.43% 1.690ms 1.690ms 3.156ms 32.84% 3.156ms 3.156ms 1 - aten::view 0.16% 18.311us 0.16% 18.311us 3.052us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.27% 31.990us 0.27% 31.990us 3.554us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.06% 6.981us 0.06% 6.981us 2.327us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.67% 312.827us 2.67% 312.827us 104.276us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 80.23% 9.393ms 80.23% 9.393ms 9.393ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 1.16% 134.713us 18.89% 2.202ms 2.202ms 0.000us 0.00% 12.808ms 12.808ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.369us 17.64% 2.056ms 685.371us 9.627ms 100.00% 12.808ms 4.269ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.02% 9.628ms 9.628ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.627ms 100.00% 9.627ms 3.209ms 3 + Activity Buffer Request 14.91% 1.739ms 14.91% 1.739ms 1.739ms 3.182ms 33.05% 3.182ms 3.182ms 1 + aten::view 0.10% 11.381us 0.10% 11.381us 1.897us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.26% 29.940us 0.26% 29.940us 3.327us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.960us 0.04% 4.960us 1.653us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.04% 237.996us 2.04% 237.996us 79.332us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 81.11% 9.457ms 81.11% 9.457ms 9.457ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 11.708ms -Self CUDA time total: 9.610ms +Self CPU time total: 11.659ms +Self CUDA time total: 9.627ms impl wl p50(ms) ok hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True -hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True +hf_kernels_layer_norm LN_B16_S4096_D4096 1.64 True hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
▶ UV Install Logs
-
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 8.01it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.00it/s] -Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.30it/s]
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] +Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 9.25it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.14it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.62it/s]

Artifacts:

layer_norm.jsonl diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html index e7175a003e61d659f44d811561339bf761f4cbcc..4ae943a0d01b41172f166198d2ba692dc91d9aec 100644 --- a/layer_norm/impls/torch_layer_norm.html +++ b/layer_norm/impls/torch_layer_norm.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.30s +Cell: nv | 0.25s | Raw @@ -3904,7 +3904,7 @@ Cell: nv | 0.30s
-
Fri Dec 19 19:40:36 2025       
+
Fri Dec 19 22:48:33 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.30s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   26C    P8             24W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   30C    P0            107W /  350W |       0MiB /  46068MiB |     68%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.30s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 32.13s
+Cell: benchmark | 7.61s
  | 
 
 Raw
@@ -3985,19 +3985,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         2.46%     151.464us        66.01%       4.061ms       4.061ms       0.000us         0.00%       3.020ms       3.020ms             1  
-                                       aten::layer_norm         0.24%      14.681us        63.55%       3.910ms       1.303ms       0.000us         0.00%       3.020ms       1.007ms             3  
-                                aten::native_layer_norm        20.97%       1.290ms        63.31%       3.895ms       1.298ms       2.310ms       100.00%       3.020ms       1.007ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.311ms       100.06%       2.311ms       2.311ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.310ms       100.00%       2.310ms     770.057us             3  
-                                Activity Buffer Request        40.34%       2.482ms        40.34%       2.482ms       2.482ms     709.854us        30.73%     709.854us     709.854us             1  
-                                            aten::empty         1.09%      66.873us         1.09%      66.873us       7.430us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.79%      48.731us         0.79%      48.731us      16.244us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.12%       7.460us         0.12%       7.460us       1.243us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        33.99%       2.091ms        33.99%       2.091ms       2.091ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.61%     151.022us        49.72%       2.081ms       2.081ms       0.000us         0.00%       3.037ms       3.037ms             1  
+                                       aten::layer_norm         0.35%      14.701us        46.11%       1.930ms     643.468us       0.000us         0.00%       3.037ms       1.012ms             3  
+                                aten::native_layer_norm         1.79%      75.131us        45.76%       1.916ms     638.567us       2.326ms       100.00%       3.037ms       1.012ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.327ms       100.06%       2.327ms       2.327ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.326ms       100.00%       2.326ms     775.187us             3  
+                                Activity Buffer Request        41.50%       1.738ms        41.50%       1.738ms       1.738ms     711.774us        30.61%     711.774us     711.774us             1  
+                                            aten::empty         1.17%      48.860us         1.17%      48.860us       5.429us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.12%      46.753us         1.12%      46.753us      15.584us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.18%       7.441us         0.18%       7.441us       1.240us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        50.28%       2.105ms        50.28%       2.105ms       2.105ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.152ms
-Self CUDA time total: 2.310ms
+Self CPU time total: 4.186ms
+Self CUDA time total: 2.326ms
 
 
 
@@ -4007,19 +4007,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.07%      70.812us        28.19%       1.857ms       1.857ms       0.000us         0.00%       6.442ms       6.442ms             1  
-                                       aten::layer_norm         0.14%       9.000us        27.11%       1.786ms     595.403us       0.000us         0.00%       6.442ms       2.147ms             3  
-                                aten::native_layer_norm         0.75%      49.502us        26.98%       1.777ms     592.403us       4.862ms       100.00%       6.442ms       2.147ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.864ms       100.03%       4.864ms       4.864ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.862ms       100.00%       4.862ms       1.621ms             3  
-                                Activity Buffer Request        25.31%       1.667ms        25.31%       1.667ms       1.667ms       1.580ms        32.49%       1.580ms       1.580ms             1  
-                                            aten::empty         0.43%      28.150us         0.43%      28.150us       3.128us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.44%      28.800us         0.44%      28.800us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       3.751us         0.06%       3.751us       0.625us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        71.81%       4.731ms        71.81%       4.731ms       4.731ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.05%      69.561us        28.39%       1.886ms       1.886ms       0.000us         0.00%       6.477ms       6.477ms             1  
+                                       aten::layer_norm         0.13%       8.670us        27.34%       1.816ms     605.463us       0.000us         0.00%       6.477ms       2.159ms             3  
+                                aten::native_layer_norm         0.77%      50.957us        27.21%       1.808ms     602.573us       4.891ms       100.00%       6.477ms       2.159ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.893ms       100.03%       4.893ms       4.893ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.891ms       100.00%       4.891ms       1.630ms             3  
+                                Activity Buffer Request        25.53%       1.696ms        25.53%       1.696ms       1.696ms       1.586ms        32.42%       1.586ms       1.586ms             1  
+                                            aten::empty         0.45%      29.753us         0.45%      29.753us       3.306us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.41%      27.542us         0.41%      27.542us       9.181us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.05%       3.522us         0.05%       3.522us       0.587us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        71.61%       4.758ms        71.61%       4.758ms       4.758ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.588ms
-Self CUDA time total: 4.862ms
+Self CPU time total: 6.643ms
+Self CUDA time total: 4.891ms
 
 
 
@@ -4029,19 +4029,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.08%      70.451us        29.89%       1.957ms       1.957ms       0.000us         0.00%       6.239ms       6.239ms             1  
-                                       aten::layer_norm         0.13%       8.611us        28.81%       1.886ms     628.738us       0.000us         0.00%       6.239ms       2.080ms             3  
-                                aten::native_layer_norm         0.76%      49.870us        28.68%       1.878ms     625.867us       4.724ms       100.00%       6.239ms       2.080ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.726ms       100.03%       4.726ms       4.726ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.724ms       100.00%       4.724ms       1.575ms             3  
-                                Activity Buffer Request        26.98%       1.766ms        26.98%       1.766ms       1.766ms       1.515ms        32.08%       1.515ms       1.515ms             1  
-                                            aten::empty         0.45%      29.490us         0.45%      29.490us       3.277us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.43%      27.941us         0.43%      27.941us       9.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       4.101us         0.06%       4.101us       0.684us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        70.11%       4.590ms        70.11%       4.590ms       4.590ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.06%      68.562us        29.18%       1.889ms       1.889ms       0.000us         0.00%       6.234ms       6.234ms             1  
+                                       aten::layer_norm         0.14%       9.330us        28.12%       1.821ms     606.966us       0.000us         0.00%       6.234ms       2.078ms             3  
+                                aten::native_layer_norm         0.78%      50.590us        27.97%       1.812ms     603.856us       4.719ms       100.00%       6.234ms       2.078ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.721ms       100.03%       4.721ms       4.721ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.719ms       100.00%       4.719ms       1.573ms             3  
+                                Activity Buffer Request        26.26%       1.700ms        26.26%       1.700ms       1.700ms       1.515ms        32.11%       1.515ms       1.515ms             1  
+                                            aten::empty         0.44%      28.660us         0.44%      28.660us       3.184us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.43%      28.042us         0.43%      28.042us       9.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.06%       3.840us         0.06%       3.840us       0.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        70.82%       4.586ms        70.82%       4.586ms       4.586ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.547ms
-Self CUDA time total: 4.724ms
+Self CPU time total: 6.476ms
+Self CUDA time total: 4.719ms
 
 
 
@@ -4051,23 +4051,23 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.65%      74.391us        15.11%       1.731ms       1.731ms       0.000us         0.00%      13.123ms      13.123ms             1  
-                                       aten::layer_norm         0.08%       9.310us        14.46%       1.656ms     552.093us       0.000us         0.00%      13.123ms       4.374ms             3  
-                                aten::native_layer_norm         0.45%      52.052us        14.38%       1.647ms     548.989us       9.864ms       100.00%      13.123ms       4.374ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.866ms       100.01%       9.866ms       9.866ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.864ms       100.00%       9.864ms       3.288ms             3  
-                                Activity Buffer Request        11.61%       1.330ms        11.61%       1.330ms       1.330ms       3.258ms        33.03%       3.258ms       3.258ms             1  
-                                            aten::empty         0.27%      31.120us         0.27%      31.120us       3.458us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.01%     229.635us         2.01%     229.635us      76.545us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.04%       4.651us         0.04%       4.651us       0.775us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        84.89%       9.721ms        84.89%       9.721ms       9.721ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.64%      72.823us        14.96%       1.710ms       1.710ms       0.000us         0.00%      13.144ms      13.144ms             1  
+                                       aten::layer_norm         0.08%       8.940us        14.32%       1.637ms     545.678us       0.000us         0.00%      13.144ms       4.381ms             3  
+                                aten::native_layer_norm         0.49%      56.431us        14.24%       1.628ms     542.698us       9.871ms       100.00%      13.144ms       4.381ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.872ms       100.02%       9.872ms       9.872ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.871ms       100.00%       9.871ms       3.290ms             3  
+                                Activity Buffer Request        11.76%       1.344ms        11.76%       1.344ms       1.344ms       3.273ms        33.16%       3.273ms       3.273ms             1  
+                                            aten::empty         0.26%      29.920us         0.26%      29.920us       3.324us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.69%     193.294us         1.69%     193.294us      64.431us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.04%       4.390us         0.04%       4.390us       0.732us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        85.04%       9.722ms        85.04%       9.722ms       9.722ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 11.451ms
-Self CUDA time total: 9.864ms
+Self CPU time total: 11.432ms
+Self CUDA time total: 9.871ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_layer_norm         LN_B16_S2048_D4096     0.81  True
+torch_layer_norm         LN_B16_S2048_D4096     0.82  True
 torch_layer_norm         LN_B16_S2048_D8192     1.68  True
 torch_layer_norm         LN_B16_S4096_D4096     1.61  True
 torch_layer_norm         LN_B16_S4096_D8192     3.32  True
@@ -4075,53 +4075,7 @@ torch_layer_norm         LN_B16_S4096_D8192     3.32  True
 
▶ UV Install Logs
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg index ca6e1d563b864065ab6e5cd1f4c45b888d285de6..3daf1e152c8628980aa60ab516fa412560a7168b 100644 --- a/layer_norm/results/artifacts/combine/latency.svg +++ b/layer_norm/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa76da3cc0e8c6ec848648e3fa2d66315df4d6c7779fd0c7e2825d697af78f88 -size 14641 +oid sha256:a50c3bf38dd2b9e606d91842d6e16e6ad0750d6688e7a674eada63b34b0c39ce +size 14635 diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html index 09dbeb416e41f9c54b369c7629f0cafd3d1178a9..aec39cf2d001e84fb54203d1db5baff01e9f5e1d 100644 --- a/layer_norm/results/combined_results.html +++ b/layer_norm/results/combined_results.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { - 2025-12-19T19:55:25.441156 + 2025-12-19T23:02:49.888978 image/svg+xml @@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content { - + - + - 1.0 + 1.0 - + - + - 1.5 + 1.5 - + - + - 2.0 + 2.0 - + - + - 2.5 + 2.5 - + - + - 3.0 + 3.0 @@ -4044,27 +4044,27 @@ body[data-tool="eraser"] .main-content { - + - - + + - + - - - - + + + + @@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.63s +Cell: combine | 4.43s | Raw @@ -4211,9 +4211,9 @@ COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True -hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True +hf_kernels_layer_norm LN_B16_S4096_D4096 1.64 True hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True -torch_layer_norm LN_B16_S2048_D4096 0.81 True +torch_layer_norm LN_B16_S2048_D4096 0.82 True torch_layer_norm LN_B16_S2048_D8192 1.68 True torch_layer_norm LN_B16_S4096_D4096 1.61 True torch_layer_norm LN_B16_S4096_D8192 3.32 True @@ -4236,7 +4236,7 @@ Implementations included:
▶ UV Install Logs
@@ -4249,7 +4249,7 @@ Installed 37 packages in 299ms - 2025-12-19T19:55:25.441156 + 2025-12-19T23:02:49.888978 image/svg+xml @@ -4333,70 +4333,70 @@ Installed 37 packages in 299ms - + - + - 1.0 + 1.0 - + - + - 1.5 + 1.5 - + - + - 2.0 + 2.0 - + - + - 2.5 + 2.5 - + - + - 3.0 + 3.0 @@ -4404,27 +4404,27 @@ Installed 37 packages in 299ms - + - - + + - + - - - - + + + + diff --git a/openai_moe/impls/artifacts/benchmark/openai_moe.jsonl b/openai_moe/impls/artifacts/benchmark/openai_moe.jsonl index 6f1e11a9e3626bb6778c7d6ca2c41917471685ba..fb420cd78d83c2f7e14a457398abf3c79df7ac8b 100644 --- a/openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +++ b/openai_moe/impls/artifacts/benchmark/openai_moe.jsonl @@ -1,8 +1,8 @@ -{"ts": "2025-12-19T19:54:31Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.591275999975551, "p50": 2.6265569999850413, "p90": 2.6390279999759514, "mean": 2.626043199961714, "iqr": 0.02082100013467425, "raw_times": [2.591275999975551, 2.6390279999759514, 2.6265569999850413, 2.618206999841277, 2.6551480000307492], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.6624880001691054, "peak_bytes": 311252992, "ok": true, "absmax": 1.0818243026733398e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.0818243026733398e-05, "mae": 1.0733322142186807e-06, "mse": 1.9560496885423495e-12, "ref": "naive_moe"}, "err": null} -{"ts": "2025-12-19T19:54:31Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.913345000000845, "p50": 3.932325000050696, "p90": 3.941766000025382, "mean": 3.9370316000258754, "iqr": 0.02511100001356681, "raw_times": [3.941766000025382, 3.913345000000845, 3.916655000011815, 3.981067000040639, 3.932325000050696], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.937866000114809, "peak_bytes": 632822272, "ok": true, "absmax": 7.82310962677002e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 7.82310962677002e-06, "mae": 5.576844728238939e-07, "mse": 5.436189692842319e-13, "ref": "naive_moe"}, "err": null} -{"ts": "2025-12-19T19:54:32Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.80903300015234, "p50": 3.849652999861064, "p90": 3.853734000131226, "mean": 3.837069200062615, "iqr": 0.039670999967711396, "raw_times": [3.8140630001635145, 3.8588630000049307, 3.80903300015234, 3.853734000131226, 3.849652999861064], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.788761999885537, "peak_bytes": 645417472, "ok": true, "absmax": 1.5497207641601562e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5497207641601562e-05, "mae": 1.1454358173068613e-06, "mse": 2.2412421311207575e-12, "ref": "naive_moe"}, "err": null} -{"ts": "2025-12-19T19:54:34Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 5.2778859999307315, "p50": 5.308016000071802, "p90": 5.336937000038233, "mean": 5.31205640004373, "iqr": 0.038680999978168984, "raw_times": [5.339187000117818, 5.336937000038233, 5.298256000060064, 5.308016000071802, 5.2778859999307315], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 5.26179400003457, "peak_bytes": 657099264, "ok": true, "absmax": 6.556510925292969e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 6.556510925292969e-06, "mae": 4.852234951613354e-07, "mse": 4.015021550906467e-13, "ref": "naive_moe"}, "err": null} -{"ts": "2025-12-19T19:54:36Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 6.679864000034286, "p50": 6.717303999948854, "p90": 6.729205000056027, "mean": 6.711754200023279, "iqr": 0.028612000050998176, "raw_times": [6.679864000034286, 6.717303999948854, 6.700593000005028, 6.7318050000722, 6.729205000056027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 6.593322000071566, "peak_bytes": 678357504, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 1.1745952406272409e-06, "mse": 2.316181968442521e-12, "ref": "naive_moe"}, "err": null} -{"ts": "2025-12-19T19:54:38Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 7.423924000022453, "p50": 7.518165999954363, "p90": 7.529216999955679, "mean": 7.5042842000129895, "iqr": 0.02257999994981219, "raw_times": [7.543477000126586, 7.529216999955679, 7.518165999954363, 7.506637000005867, 7.423924000022453], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 7.323180999946999, "peak_bytes": 701983232, "ok": true, "absmax": 8.58306884765625e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.58306884765625e-06, "mae": 5.268635732136318e-07, "mse": 4.753664909623589e-13, "ref": "naive_moe"}, "err": null} -{"ts": "2025-12-19T19:54:42Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.163481999981741, "p50": 13.23755299995355, "p90": 13.251324000066234, "mean": 13.23588719997133, "iqr": 0.04864200013798836, "raw_times": [13.163481999981741, 13.202681999928245, 13.23755299995355, 13.32439499992688, 13.251324000066234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.090128999920125, "peak_bytes": 1012207616, "ok": true, "absmax": 1.71661376953125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.71661376953125e-05, "mae": 1.797086838450923e-06, "mse": 5.3811247992252564e-12, "ref": "naive_moe"}, "err": null} -{"ts": "2025-12-19T19:54:46Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.26829100003124, "p50": 13.362623000148233, "p90": 13.40691399991556, "mean": 13.346813000043767, "iqr": 0.1288519999889104, "raw_times": [13.40691399991556, 13.418175000197152, 13.26829100003124, 13.27806199992665, 13.362623000148233], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 12.873562000095262, "peak_bytes": 910968320, "ok": true, "absmax": 8.344650268554688e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.344650268554688e-06, "mae": 5.471991357808292e-07, "mse": 5.06310813587485e-13, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:31Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.642566999838891, "p50": 2.6590969998778746, "p90": 2.673486999810848, "mean": 2.659981199894901, "iqr": 0.023999999939405825, "raw_times": [2.6590969998778746, 2.675268000075448, 2.649486999871442, 2.642566999838891, 2.673486999810848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.7064890000474406, "peak_bytes": 311252992, "ok": true, "absmax": 1.0818243026733398e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.0818243026733398e-05, "mae": 1.0733322142186807e-06, "mse": 1.9560496885423495e-12, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:32Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.936204999945403, "p50": 3.9538260000426817, "p90": 3.9835660002154327, "mean": 3.9606518000255164, "iqr": 0.039130000232034945, "raw_times": [3.936204999945403, 3.9538260000426817, 3.9835660002154327, 3.985225999940667, 3.944435999983398], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.9596259998688765, "peak_bytes": 632822272, "ok": true, "absmax": 7.82310962677002e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 7.82310962677002e-06, "mae": 5.576844728238939e-07, "mse": 5.436189692842319e-13, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:33Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.817872999889005, "p50": 3.868872999873929, "p90": 3.9019339999413205, "mean": 3.8749997999275365, "iqr": 0.044331000026431866, "raw_times": [3.817872999889005, 3.8576029999148886, 3.9287160000185395, 3.9019339999413205, 3.868872999873929], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.836012999954619, "peak_bytes": 645417472, "ok": true, "absmax": 1.5497207641601562e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5497207641601562e-05, "mae": 1.1454358173068613e-06, "mse": 2.2412421311207575e-12, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:34Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 5.3247949999786215, "p50": 5.3401449999910255, "p90": 5.39184700005535, "mean": 5.356893600037438, "iqr": 0.06286200004979037, "raw_times": [5.39184700005535, 5.3247949999786215, 5.3401449999910255, 5.328985000005559, 5.398696000156633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 5.315443999961644, "peak_bytes": 657099264, "ok": true, "absmax": 6.556510925292969e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 6.556510925292969e-06, "mae": 4.852234951613354e-07, "mse": 4.015021550906467e-13, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:36Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 6.797146999815595, "p50": 6.804686999885234, "p90": 6.806136000022889, "mean": 6.814822799969988, "iqr": 0.0027099999897473026, "raw_times": [6.862718000093082, 6.806136000022889, 6.797146999815595, 6.8034260000331415, 6.804686999885234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 6.6412220000984235, "peak_bytes": 678357504, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 1.1745952406272409e-06, "mse": 2.316181968442521e-12, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:38Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 7.520542000065689, "p50": 7.530022999844732, "p90": 7.53409300000385, "mean": 7.531816999926377, "iqr": 0.0065400001858506585, "raw_times": [7.520542000065689, 7.527552999817999, 7.546873999899617, 7.530022999844732, 7.53409300000385], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 7.356247999950938, "peak_bytes": 701983232, "ok": true, "absmax": 8.58306884765625e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.58306884765625e-06, "mae": 5.268635732136318e-07, "mse": 4.753664909623589e-13, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:42Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.378247999980886, "p50": 13.385679999828426, "p90": 13.397299999951429, "mean": 13.394303199947899, "iqr": 0.012501999890446314, "raw_times": [13.378247999980886, 13.384798000060982, 13.425489999917772, 13.385679999828426, 13.397299999951429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.166785000066739, "peak_bytes": 1012207616, "ok": true, "absmax": 1.71661376953125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.71661376953125e-05, "mae": 1.797086838450923e-06, "mse": 5.3811247992252564e-12, "ref": "naive_moe"}, "err": null} +{"ts": "2025-12-19T23:01:46Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.328448000038406, "p50": 13.40927800015379, "p90": 13.441681000131211, "mean": 13.402939000070546, "iqr": 0.0636730001133401, "raw_times": [13.457280000011451, 13.441681000131211, 13.40927800015379, 13.378008000017871, 13.328448000038406], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.036729999839736, "peak_bytes": 910968320, "ok": true, "absmax": 8.344650268554688e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.344650268554688e-06, "mae": 5.471991357808292e-07, "mse": 5.06310813587485e-13, "ref": "naive_moe"}, "err": null} diff --git a/openai_moe/impls/binned_torch.html b/openai_moe/impls/binned_torch.html index 9b8dbbaed830012937357a2b35ef53cf162e2359..bbeb179f09c83fec11c75019ad7e4482d971ac4f 100644 --- a/openai_moe/impls/binned_torch.html +++ b/openai_moe/impls/binned_torch.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.28s +Cell: nv | 0.25s | Raw @@ -3904,7 +3904,7 @@ Cell: nv | 0.28s
-
Fri Dec 19 19:41:48 2025       
+
Fri Dec 19 23:00:37 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0            126W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   40C    P0             84W /  350W |       0MiB /  46068MiB |     60%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.28s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 733.46s
+Cell: benchmark | 723.84s
  | 
 
 Raw
@@ -4095,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     935.516ms      1843.92%     935.516ms     935.516ms             1  
-                                           binned_torch        24.73%     231.815ms       100.00%     937.553ms     937.553ms       0.000us         0.00%      50.740ms      50.740ms             1  
-                                             aten::item         1.92%      17.997ms        26.19%     245.573ms      16.003us       0.000us         0.00%      15.756ms       1.027us         15345  
-                              aten::_local_scalar_dense         6.46%      60.533ms        24.27%     227.576ms      14.831us      15.755ms        31.05%      15.756ms       1.027us         15345  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      15.755ms        31.05%      15.755ms       1.027us         15345  
-                                     aten::floor_divide         5.33%      49.954ms        13.00%     121.926ms      19.845us       7.813ms        15.40%       7.813ms       1.272us          6144  
-                                              aten::bmm         0.02%     192.684us         0.02%     232.345us      38.724us       7.792ms        15.36%       7.792ms       1.299ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.792ms        15.36%       7.792ms       1.299ms             6  
-                                            aten::copy_         3.73%      34.970ms         9.17%      86.008ms      13.971us       6.589ms        12.99%       6.590ms       1.071us          6156  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.585ms        12.98%       6.585ms       1.070us          6153  
-                                              aten::mul         3.28%      30.750ms         5.69%      53.382ms      17.326us       4.708ms         9.28%       4.708ms       1.528us          3081  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.480ms         8.83%       4.480ms       1.458us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.159ms         8.20%       4.159ms       1.354us          3072  
-                                        aten::remainder         3.15%      29.490ms         4.77%      44.737ms      14.563us       3.838ms         7.56%       3.838ms       1.249us          3072  
-                                              aten::add         2.76%      25.910ms         4.76%      44.643ms      14.719us       3.755ms         7.40%       3.755ms       1.238us          3033  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.655ms         7.20%       3.655ms       1.190us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.364ms         6.63%       3.364ms       1.110us          3030  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.022ms         3.99%       2.022ms       1.316us          1536  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.816ms         3.58%       1.816ms       1.182us          1536  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     284.802us         0.56%     284.802us      47.467us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     916.334ms      1818.27%     916.334ms     916.334ms             1  
+                                           binned_torch        24.63%     226.221ms       100.00%     918.346ms     918.346ms       0.000us         0.00%      50.398ms      50.398ms             1  
+                                             aten::item         1.84%      16.915ms        25.73%     236.247ms      15.396us       0.000us         0.00%      15.727ms       1.025us         15345  
+                              aten::_local_scalar_dense         5.92%      54.373ms        23.88%     219.332ms      14.293us      15.726ms        31.20%      15.727ms       1.025us         15345  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      15.726ms        31.20%      15.726ms       1.025us         15345  
+                                              aten::bmm         0.02%     194.226us         0.03%     236.195us      39.366us       8.013ms        15.90%       8.013ms       1.336ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       8.013ms        15.90%       8.013ms       1.336ms             6  
+                                     aten::floor_divide         5.35%      49.157ms        13.15%     120.743ms      19.652us       7.547ms        14.98%       7.547ms       1.228us          6144  
+                                            aten::copy_         3.75%      34.457ms         9.21%      84.535ms      13.732us       6.589ms        13.08%       6.592ms       1.071us          6156  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.585ms        13.07%       6.585ms       1.070us          6153  
+                                              aten::mul         3.14%      28.847ms         5.63%      51.742ms      16.794us       4.707ms         9.34%       4.707ms       1.528us          3081  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.479ms         8.89%       4.479ms       1.458us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.026ms         7.99%       4.026ms       1.311us          3072  
+                                        aten::remainder         3.09%      28.363ms         4.76%      43.750ms      14.241us       3.702ms         7.35%       3.702ms       1.205us          3072  
+                                              aten::add         2.79%      25.584ms         4.81%      44.150ms      14.557us       3.631ms         7.20%       3.631ms       1.197us          3033  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.522ms         6.99%       3.522ms       1.147us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.235ms         6.42%       3.235ms       1.068us          3030  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.954ms         3.88%       1.954ms       1.272us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.749ms         3.47%       1.749ms       1.138us          1536  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     287.138us         0.57%     287.138us      47.856us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 937.562ms
-Self CUDA time total: 50.735ms
+Self CPU time total: 918.353ms
+Self CUDA time total: 50.396ms
 
 
 
@@ -4127,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     958.363ms      1758.28%     958.363ms     958.363ms             1  
-                                           binned_torch        24.25%     232.525ms       100.00%     958.754ms     958.754ms       0.000us         0.00%      54.510ms      54.510ms             1  
-                                             aten::item         1.77%      17.002ms        27.44%     263.071ms      15.534us       0.000us         0.00%      17.862ms       1.055us         16935  
-                              aten::_local_scalar_dense         6.54%      62.707ms        25.67%     246.070ms      14.530us      17.860ms        32.77%      17.862ms       1.055us         16935  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      17.860ms        32.77%      17.860ms       1.055us         16935  
-                                              aten::bmm         0.02%     170.065us         0.02%     212.615us      35.436us       7.895ms        14.48%       7.895ms       1.316ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.895ms        14.48%       7.895ms       1.316ms             6  
-                                     aten::floor_divide         4.96%      47.565ms        12.31%     117.977ms      19.202us       7.812ms        14.33%       7.813ms       1.272us          6144  
-                                            aten::copy_         3.61%      34.645ms         8.68%      83.187ms      13.513us       6.631ms        12.17%       6.631ms       1.077us          6156  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.628ms        12.16%       6.628ms       1.077us          6152  
-                                              aten::add         3.91%      37.531ms         7.22%      69.217ms      15.070us       5.262ms         9.65%       5.262ms       1.146us          4593  
-                                              aten::mul         3.03%      29.029ms         5.30%      50.820ms      16.495us       4.703ms         8.63%       4.703ms       1.526us          3081  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.476ms         8.21%       4.476ms       1.457us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.156ms         7.62%       4.156ms       1.353us          3072  
-                                        aten::remainder         2.84%      27.273ms         4.45%      42.673ms      13.891us       3.854ms         7.07%       3.854ms       1.255us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.656ms         6.71%       3.656ms       1.190us          3072  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.271ms         6.00%       3.271ms       1.080us          3030  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.031ms         3.73%       2.031ms       1.323us          1536  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.822ms         3.34%       1.822ms       1.187us          1536  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.585ms         2.91%       1.585ms       1.016us          1560  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     930.604ms      1724.65%     930.604ms     930.604ms             1  
+                                           binned_torch        24.29%     226.115ms       100.00%     930.865ms     930.865ms       0.000us         0.00%      53.966ms      53.966ms             1  
+                                             aten::item         1.81%      16.815ms        27.55%     256.425ms      15.142us       0.000us         0.00%      17.838ms       1.053us         16935  
+                              aten::_local_scalar_dense         6.14%      57.141ms        25.74%     239.611ms      14.149us      17.835ms        33.05%      17.838ms       1.053us         16935  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      17.835ms        33.05%      17.835ms       1.053us         16935  
+                                              aten::bmm         0.02%     175.424us         0.02%     217.325us      36.221us       7.967ms        14.77%       7.967ms       1.328ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.967ms        14.77%       7.967ms       1.328ms             6  
+                                     aten::floor_divide         5.05%      47.005ms        12.57%     117.000ms      19.043us       7.550ms        13.99%       7.551ms       1.229us          6144  
+                                            aten::copy_         3.51%      32.640ms         8.36%      77.831ms      12.643us       6.635ms        12.30%       6.635ms       1.078us          6156  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.632ms        12.29%       6.632ms       1.078us          6152  
+                                              aten::add         3.89%      36.256ms         6.95%      64.697ms      14.086us       5.059ms         9.38%       5.059ms       1.102us          4593  
+                                              aten::mul         2.92%      27.144ms         5.32%      49.502ms      16.067us       4.707ms         8.72%       4.707ms       1.528us          3081  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.479ms         8.30%       4.479ms       1.458us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.026ms         7.46%       4.026ms       1.310us          3072  
+                                        aten::remainder         2.81%      26.197ms         4.49%      41.800ms      13.607us       3.721ms         6.90%       3.721ms       1.211us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.524ms         6.53%       3.524ms       1.147us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.140ms         5.82%       3.140ms       1.036us          3030  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.965ms         3.64%       1.965ms       1.279us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.756ms         3.25%       1.756ms       1.143us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.517ms         2.81%       1.517ms       0.972us          1560  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 958.762ms
-Self CUDA time total: 54.506ms
+Self CPU time total: 930.874ms
+Self CUDA time total: 53.959ms
 
 
 
@@ -4159,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.754s      1688.21%        1.754s        1.754s             1  
-                                           binned_torch        24.13%     423.200ms       100.00%        1.754s        1.754s       0.000us         0.00%     103.889ms     103.889ms             1  
-                                             aten::item         1.68%      29.485ms        26.54%     465.492ms      15.256us       0.000us         0.00%      31.587ms       1.035us         30513  
-                              aten::_local_scalar_dense         6.17%     108.158ms        24.86%     436.007ms      14.289us      31.585ms        30.40%      31.587ms       1.035us         30513  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      31.585ms        30.40%      31.585ms       1.035us         30513  
-                                     aten::floor_divide         5.33%      93.524ms        13.33%     233.711ms      19.019us      15.605ms        15.02%      15.605ms       1.270us         12288  
-                                              aten::bmm         0.01%     221.157us         0.02%     267.387us      44.564us      15.098ms        14.53%      15.098ms       2.516ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.098ms        14.53%      15.098ms       2.516ms             6  
-                                            aten::copy_         3.90%      68.459ms         9.45%     165.766ms      13.477us      13.325ms        12.83%      13.325ms       1.083us         12300  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.322ms        12.82%      13.322ms       1.084us         12294  
-                                              aten::mul         3.29%      57.635ms         5.89%     103.357ms      16.798us      11.271ms        10.85%      11.273ms       1.832us          6153  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.920ms         9.55%       9.920ms       1.615us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.308ms         8.00%       8.308ms       1.352us          6144  
-                                        aten::remainder         3.09%      54.193ms         4.85%      85.026ms      13.839us       7.675ms         7.39%       7.675ms       1.249us          6144  
-                                              aten::add         2.79%      48.989ms         4.92%      86.297ms      14.595us       7.638ms         7.35%       7.639ms       1.292us          5913  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.297ms         7.02%       7.297ms       1.188us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.357ms         6.12%       6.357ms       1.076us          5910  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.044ms         3.89%       4.044ms       1.317us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.632ms         3.50%       3.632ms       1.182us          3072  
-                                            aten::clamp         0.00%      73.899us         0.01%     123.411us      20.569us       1.193ms         1.15%       1.193ms     198.833us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.706s      1653.15%        1.706s        1.706s             1  
+                                           binned_torch        24.03%     409.734ms       100.00%        1.705s        1.705s       0.000us         0.00%     103.183ms     103.183ms             1  
+                                             aten::item         1.59%      27.070ms        26.54%     452.490ms      14.829us       0.000us         0.00%      31.572ms       1.035us         30513  
+                              aten::_local_scalar_dense         5.90%     100.602ms        24.95%     425.421ms      13.942us      31.568ms        30.60%      31.572ms       1.035us         30513  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      31.568ms        30.60%      31.568ms       1.035us         30513  
+                                              aten::bmm         0.01%     213.024us         0.02%     261.877us      43.646us      15.473ms        15.00%      15.473ms       2.579ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.473ms        15.00%      15.473ms       2.579ms             6  
+                                     aten::floor_divide         5.42%      92.355ms        13.36%     227.861ms      18.543us      15.078ms        14.61%      15.078ms       1.227us         12288  
+                                            aten::copy_         3.96%      67.445ms         9.41%     160.444ms      13.044us      13.330ms        12.92%      13.330ms       1.084us         12300  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.326ms        12.92%      13.326ms       1.084us         12294  
+                                              aten::mul         3.18%      54.204ms         5.76%      98.288ms      15.974us      11.263ms        10.92%      11.265ms       1.831us          6153  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.919ms         9.61%       9.919ms       1.614us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.044ms         7.80%       8.044ms       1.309us          6144  
+                                        aten::remainder         3.09%      52.622ms         4.84%      82.495ms      13.427us       7.409ms         7.18%       7.409ms       1.206us          6144  
+                                              aten::add         2.82%      48.063ms         4.95%      84.371ms      14.269us       7.380ms         7.15%       7.380ms       1.248us          5913  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.034ms         6.82%       7.034ms       1.145us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.098ms         5.91%       6.098ms       1.032us          5910  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.912ms         3.79%       3.912ms       1.273us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.498ms         3.39%       3.498ms       1.139us          3072  
+                                            aten::clamp         0.00%      70.381us         0.01%     115.343us      19.224us       1.182ms         1.15%       1.182ms     197.026us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.754s
-Self CUDA time total: 103.882ms
+Self CPU time total: 1.705s
+Self CUDA time total: 103.179ms
 
 
 
@@ -4191,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.874s      1695.99%        1.874s        1.874s             1  
-                                           binned_torch        24.25%     455.076ms       100.00%        1.876s        1.876s       0.000us         0.00%     110.516ms     110.516ms             1  
-                                             aten::item         1.77%      33.154ms        27.43%     514.675ms      15.259us       0.000us         0.00%      34.979ms       1.037us         33729  
-                              aten::_local_scalar_dense         6.27%     117.583ms        25.66%     481.520ms      14.276us      34.976ms        31.65%      34.979ms       1.037us         33729  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      34.976ms        31.65%      34.976ms       1.037us         33728  
-                                     aten::floor_divide         4.89%      91.819ms        12.09%     226.952ms      18.469us      15.582ms        14.10%      15.582ms       1.268us         12288  
-                                              aten::bmm         0.01%     222.715us         0.01%     267.616us      44.603us      15.462ms        13.99%      15.462ms       2.577ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.462ms        13.99%      15.462ms       2.577ms             6  
-                                            aten::copy_         3.58%      67.106ms         8.62%     161.781ms      13.153us      13.339ms        12.07%      13.341ms       1.085us         12300  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.337ms        12.07%      13.337ms       1.085us         12294  
-                                              aten::mul         3.09%      57.893ms         5.35%     100.363ms      16.311us      10.926ms         9.89%      10.927ms       1.776us          6153  
-                                              aten::add         4.06%      76.225ms         6.94%     130.290ms      14.319us      10.845ms         9.81%      10.845ms       1.192us          9099  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.572ms         8.66%       9.572ms       1.558us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.302ms         7.51%       8.302ms       1.351us          6144  
-                                        aten::remainder         2.99%      56.031ms         4.55%      85.473ms      13.912us       7.682ms         6.95%       7.682ms       1.250us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.280ms         6.59%       7.280ms       1.185us          6144  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.358ms         5.75%       6.358ms       1.076us          5910  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.050ms         3.67%       4.050ms       1.318us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.631ms         3.29%       3.631ms       1.182us          3072  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.228ms         2.92%       3.228ms       1.013us          3186  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.835s      1676.06%        1.835s        1.835s             1  
+                                           binned_torch        24.11%     442.690ms       100.00%        1.836s        1.836s       0.000us         0.00%     109.503ms     109.503ms             1  
+                                             aten::item         1.62%      29.702ms        27.50%     504.982ms      14.972us       0.000us         0.00%      35.015ms       1.038us         33729  
+                              aten::_local_scalar_dense         6.21%     114.112ms        25.88%     475.279ms      14.091us      35.012ms        31.97%      35.015ms       1.038us         33729  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      35.012ms        31.97%      35.012ms       1.038us         33728  
+                                              aten::bmm         0.01%     232.655us         0.02%     282.685us      47.114us      15.567ms        14.22%      15.567ms       2.595ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.567ms        14.22%      15.567ms       2.595ms             6  
+                                     aten::floor_divide         5.11%      93.914ms        12.52%     229.926ms      18.711us      15.067ms        13.76%      15.067ms       1.226us         12288  
+                                            aten::copy_         3.50%      64.191ms         8.58%     157.627ms      12.815us      13.353ms        12.19%      13.355ms       1.086us         12300  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.350ms        12.19%      13.350ms       1.086us         12294  
+                                              aten::mul         2.97%      54.553ms         5.34%      97.962ms      15.921us      10.925ms         9.98%      10.925ms       1.776us          6153  
+                                              aten::add         3.96%      72.764ms         6.93%     127.157ms      13.975us      10.457ms         9.55%      10.457ms       1.149us          9099  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.572ms         8.74%       9.572ms       1.558us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.046ms         7.35%       8.046ms       1.310us          6144  
+                                        aten::remainder         2.95%      54.099ms         4.66%      85.633ms      13.938us       7.422ms         6.78%       7.422ms       1.208us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.021ms         6.41%       7.021ms       1.143us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.106ms         5.58%       6.106ms       1.033us          5910  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.920ms         3.58%       3.920ms       1.276us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.502ms         3.20%       3.502ms       1.140us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.094ms         2.83%       3.094ms       0.971us          3186  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.876s
-Self CUDA time total: 110.507ms
+Self CPU time total: 1.836s
+Self CUDA time total: 109.497ms
 
 
 
@@ -4223,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.610s      1697.16%        3.610s        3.610s             1  
-                                           binned_torch        23.68%     855.222ms       100.00%        3.611s        3.611s       0.000us         0.00%     212.735ms     212.735ms             1  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      63.569ms        29.88%      63.569ms       1.032us         61587  
-                                             aten::item         1.81%      65.197ms        27.34%     987.119ms      16.028us       0.000us         0.00%      63.568ms       1.032us         61587  
-                              aten::_local_scalar_dense         6.48%     233.826ms        25.53%     921.922ms      14.969us      63.567ms        29.88%      63.568ms       1.032us         61587  
-                                     aten::floor_divide         5.24%     189.036ms        13.02%     470.235ms      19.134us      31.579ms        14.85%      31.582ms       1.285us         24576  
-                                              aten::bmm         0.01%     232.455us         0.01%     281.845us      46.974us      29.001ms        13.63%      29.001ms       4.833ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.001ms        13.63%      29.001ms       4.833ms             6  
-                                            aten::copy_         3.67%     132.477ms         9.25%     334.079ms      13.587us      26.719ms        12.56%      26.722ms       1.087us         24588  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.715ms        12.56%      26.715ms       1.087us         24585  
-                                              aten::mul         3.15%     113.903ms         5.68%     205.201ms      16.687us      25.580ms        12.03%      25.582ms       2.080us         12297  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.132ms        10.40%      22.132ms       1.801us         12288  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.992ms         7.99%      16.992ms       1.383us         12288  
-                                              aten::add         2.81%     101.355ms         4.98%     179.658ms      14.476us      16.634ms         7.82%      16.635ms       1.340us         12411  
-                                        aten::remainder         3.15%     113.609ms         4.99%     180.020ms      14.650us      15.413ms         7.25%      15.415ms       1.255us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.588ms         6.86%      14.588ms       1.187us         12288  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.512ms         6.35%      13.512ms       1.089us         12408  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.121ms         3.82%       8.121ms       1.322us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.292ms         3.43%       7.292ms       1.187us          6144  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.612ms         1.23%       2.612ms     435.298us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.483s      1652.23%        3.483s        3.483s             1  
+                                           binned_torch        24.18%     842.026ms       100.00%        3.482s        3.482s       0.000us         0.00%     210.838ms     210.838ms             1  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      63.561ms        30.15%      63.561ms       1.032us         61586  
+                                             aten::item         1.74%      60.466ms        26.96%     938.865ms      15.245us       0.000us         0.00%      63.559ms       1.032us         61587  
+                              aten::_local_scalar_dense         6.04%     210.488ms        25.22%     878.295ms      14.261us      63.559ms        30.15%      63.559ms       1.032us         61587  
+                                     aten::floor_divide         5.38%     187.378ms        13.29%     462.870ms      18.834us      30.531ms        14.48%      30.538ms       1.243us         24576  
+                                              aten::bmm         0.01%     232.923us         0.01%     283.154us      47.192us      29.267ms        13.88%      29.267ms       4.878ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.267ms        13.88%      29.267ms       4.878ms             6  
+                                            aten::copy_         3.71%     129.087ms         8.89%     309.556ms      12.590us      26.727ms        12.68%      26.728ms       1.087us         24588  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.725ms        12.68%      26.725ms       1.087us         24582  
+                                              aten::mul         3.12%     108.737ms         5.69%     198.327ms      16.128us      25.576ms        12.13%      25.578ms       2.080us         12297  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.134ms        10.50%      22.134ms       1.801us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.473ms         7.81%      16.473ms       1.341us         12288  
+                                              aten::add         2.81%      97.833ms         4.96%     172.866ms      13.928us      16.092ms         7.63%      16.093ms       1.297us         12411  
+                                        aten::remainder         3.07%     106.957ms         4.82%     167.982ms      13.670us      14.887ms         7.06%      14.889ms       1.212us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.058ms         6.67%      14.058ms       1.144us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.970ms         6.15%      12.970ms       1.045us         12408  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.857ms         3.73%       7.857ms       1.279us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.030ms         3.33%       7.030ms       1.144us          6144  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.605ms         1.24%       2.605ms     434.242us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.611s
-Self CUDA time total: 212.720ms
+Self CPU time total: 3.483s
+Self CUDA time total: 210.821ms
 
 
 
@@ -4255,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.762s      1666.70%        3.762s        3.762s             1  
-                                           binned_torch        23.91%     899.748ms       100.00%        3.764s        3.764s       0.000us         0.00%     225.734ms     225.734ms             1  
-                                             aten::item         1.82%      68.620ms        27.46%        1.034s      15.235us       0.000us         0.00%      69.795ms       1.029us         67845  
-                              aten::_local_scalar_dense         6.31%     237.441ms        25.64%     964.994ms      14.224us      69.792ms        30.92%      69.795ms       1.029us         67845  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      69.793ms        30.92%      69.793ms       1.029us         67840  
-                                     aten::floor_divide         4.95%     186.290ms        12.17%     458.105ms      18.640us      31.553ms        13.98%      31.560ms       1.284us         24576  
-                                              aten::bmm         0.01%     226.315us         0.01%     272.505us      45.418us      29.269ms        12.97%      29.269ms       4.878ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.269ms        12.97%      29.269ms       4.878ms             6  
-                                            aten::copy_         3.56%     134.013ms         8.54%     321.380ms      13.071us      26.742ms        11.85%      26.743ms       1.088us         24588  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.740ms        11.85%      26.740ms       1.088us         24581  
-                                              aten::mul         3.06%     115.077ms         5.31%     199.757ms      16.244us      25.618ms        11.35%      25.618ms       2.083us         12297  
-                                              aten::add         4.14%     155.825ms         7.08%     266.365ms      14.291us      23.275ms        10.31%      23.276ms       1.249us         18639  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.160ms         9.82%      22.160ms       1.803us         12288  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      17.005ms         7.53%      17.005ms       1.384us         12287  
-                                        aten::remainder         2.93%     110.282ms         4.49%     168.952ms      13.749us      15.362ms         6.81%      15.364ms       1.250us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.548ms         6.45%      14.548ms       1.184us         12287  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.690ms         6.07%      13.690ms       1.103us         12407  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.098ms         3.59%       8.098ms       1.318us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.264ms         3.22%       7.264ms       1.182us          6144  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.476ms         2.87%       6.476ms       1.040us          6228  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.725s      1668.35%        3.725s        3.725s             1  
+                                           binned_torch        24.05%     896.242ms       100.00%        3.727s        3.727s       0.000us         0.00%     223.307ms     223.307ms             1  
+                                             aten::item         1.73%      64.547ms        27.53%        1.026s      15.123us       0.000us         0.00%      69.633ms       1.026us         67845  
+                              aten::_local_scalar_dense         6.19%     230.534ms        25.80%     961.495ms      14.172us      69.631ms        31.18%      69.633ms       1.026us         67845  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      69.632ms        31.18%      69.632ms       1.026us         67841  
+                                     aten::floor_divide         5.09%     189.838ms        12.50%     465.764ms      18.952us      30.442ms        13.63%      30.448ms       1.239us         24576  
+                                              aten::bmm         0.01%     247.707us         0.01%     294.697us      49.116us      29.554ms        13.24%      29.554ms       4.926ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.554ms        13.24%      29.554ms       4.926ms             6  
+                                            aten::copy_         3.50%     130.326ms         8.36%     311.636ms      12.674us      26.718ms        11.97%      26.719ms       1.087us         24588  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.715ms        11.96%      26.715ms       1.087us         24581  
+                                              aten::mul         2.92%     108.800ms         5.34%     198.878ms      16.173us      25.547ms        11.44%      25.547ms       2.077us         12297  
+                                              aten::add         3.96%     147.436ms         7.04%     262.447ms      14.081us      22.490ms        10.07%      22.492ms       1.207us         18639  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.115ms         9.90%      22.115ms       1.800us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.451ms         7.37%      16.451ms       1.339us         12287  
+                                        aten::remainder         2.81%     104.739ms         4.44%     165.425ms      13.462us      14.805ms         6.63%      14.806ms       1.205us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      13.992ms         6.27%      13.992ms       1.139us         12287  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.166ms         5.90%      13.166ms       1.061us         12407  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.819ms         3.50%       7.819ms       1.273us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.986ms         3.13%       6.986ms       1.137us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.214ms         2.78%       6.214ms       0.998us          6228  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.764s
-Self CUDA time total: 225.722ms
+Self CPU time total: 3.727s
+Self CUDA time total: 223.293ms
 
 
 
@@ -4287,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        7.172s      1685.34%        7.172s        7.172s             1  
-                                           binned_torch        23.83%        1.712s       100.00%        7.184s        7.184s       0.000us         0.00%     425.602ms     425.602ms             1  
-                                             aten::item         1.77%     127.233ms        27.17%        1.952s      15.898us       0.000us         0.00%     127.069ms       1.035us        122763  
-                              aten::_local_scalar_dense         6.22%     446.668ms        25.40%        1.825s      14.862us     127.060ms        29.86%     127.069ms       1.035us        122763  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     127.060ms        29.86%     127.060ms       1.035us        122762  
-                                     aten::floor_divide         5.22%     375.373ms        13.07%     938.750ms      19.099us      63.372ms        14.89%      63.374ms       1.289us         49152  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.057ms        13.41%      57.057ms       9.509ms             6  
-                                              aten::bmm         0.00%     232.954us         0.00%     280.556us      46.759us      57.057ms        13.41%      57.057ms       9.509ms             6  
-                                            aten::copy_         3.67%     263.382ms         9.14%     656.814ms      13.361us      53.605ms        12.60%      53.606ms       1.090us         49158  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.603ms        12.60%      53.603ms       1.091us         49154  
-                                              aten::mul         3.19%     229.239ms         5.71%     410.065ms      16.679us      51.561ms        12.12%      51.568ms       2.098us         24585  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.597ms        10.48%      44.597ms       1.815us         24576  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      34.170ms         8.03%      34.170ms       1.390us         24576  
-                                              aten::add         2.78%     199.917ms         4.97%     356.982ms      14.609us      33.583ms         7.89%      33.584ms       1.374us         24435  
-                                        aten::remainder         3.17%     227.943ms         4.97%     356.780ms      14.517us      30.902ms         7.26%      30.903ms       1.257us         24576  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.202ms         6.86%      29.202ms       1.188us         24576  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      26.924ms         6.33%      26.924ms       1.102us         24431  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.278ms         3.82%      16.278ms       1.325us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.628ms         3.44%      14.628ms       1.190us         12288  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       5.242ms         1.23%       5.242ms     873.601us             6  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        6.919s      1639.48%        6.919s        6.919s             1  
+                                           binned_torch        24.46%        1.695s       100.00%        6.929s        6.929s       0.000us         0.00%     422.036ms     422.036ms             1  
+                                             aten::item         1.67%     115.500ms        26.73%        1.852s      15.089us       0.000us         0.00%     127.102ms       1.035us        122763  
+                              aten::_local_scalar_dense         5.94%     411.594ms        25.07%        1.737s      14.148us     127.094ms        30.12%     127.102ms       1.035us        122763  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     127.096ms        30.12%     127.096ms       1.035us        122762  
+                                     aten::floor_divide         5.38%     373.026ms        13.30%     921.425ms      18.746us      61.339ms        14.53%      61.343ms       1.248us         49152  
+                                              aten::bmm         0.00%     231.234us         0.00%     280.225us      46.704us      57.287ms        13.57%      57.287ms       9.548ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.287ms        13.57%      57.287ms       9.548ms             6  
+                                            aten::copy_         3.72%     257.654ms         8.91%     617.063ms      12.553us      53.696ms        12.72%      53.697ms       1.092us         49158  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.694ms        12.72%      53.694ms       1.092us         49154  
+                                              aten::mul         3.13%     217.096ms         5.68%     393.622ms      16.011us      51.639ms        12.24%      51.644ms       2.101us         24585  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.676ms        10.59%      44.676ms       1.818us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      33.163ms         7.86%      33.163ms       1.349us         24576  
+                                              aten::add         2.81%     194.866ms         4.91%     340.544ms      13.937us      32.585ms         7.72%      32.588ms       1.334us         24435  
+                                        aten::remainder         3.09%     213.993ms         4.85%     335.801ms      13.664us      29.914ms         7.09%      29.918ms       1.217us         24576  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.177ms         6.68%      28.177ms       1.147us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.921ms         6.14%      25.921ms       1.061us         24431  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.786ms         3.74%      15.786ms       1.285us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.129ms         3.35%      14.129ms       1.150us         12288  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       5.239ms         1.24%       5.239ms     873.180us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.184s
-Self CUDA time total: 425.579ms
+Self CPU time total: 6.929s
+Self CUDA time total: 422.014ms
 
 
 
@@ -4319,45 +4319,45 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        7.590s      1687.04%        7.590s        7.590s             1  
-                                           binned_torch        23.93%        1.817s       100.00%        7.592s        7.592s       0.000us         0.00%     449.935ms     449.935ms             1  
-                                             aten::item         1.74%     131.929ms        27.26%        2.070s      15.365us       0.000us         0.00%     139.467ms       1.035us        134715  
-                              aten::_local_scalar_dense         6.36%     483.083ms        25.53%        1.938s      14.386us     139.456ms        31.00%     139.467ms       1.035us        134715  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     139.456ms        31.00%     139.456ms       1.035us        134706  
-                                     aten::floor_divide         4.94%     375.293ms        12.19%     925.665ms      18.833us      63.455ms        14.10%      63.460ms       1.291us         49152  
-                                              aten::bmm         0.00%     234.075us         0.00%     282.947us      47.158us      56.663ms        12.59%      56.663ms       9.444ms             6  
-                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      56.663ms        12.59%      56.663ms       9.444ms             6  
-                                            aten::copy_         3.75%     285.044ms         8.75%     664.131ms      13.510us      53.858ms        11.97%      53.860ms       1.096us         49158  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.855ms        11.97%      53.855ms       1.096us         49149  
-                                              aten::mul         3.08%     233.920ms         5.34%     405.684ms      16.501us      51.582ms        11.47%      51.587ms       2.098us         24585  
-                                              aten::add         3.87%     294.168ms         6.87%     521.854ms      14.354us      45.530ms        10.12%      45.534ms       1.252us         36357  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.640ms         9.92%      44.640ms       1.816us         24576  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      34.166ms         7.59%      34.166ms       1.390us         24573  
-                                        aten::remainder         2.91%     220.707ms         4.59%     348.339ms      14.174us      30.841ms         6.86%      30.843ms       1.255us         24576  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.291ms         6.51%      29.291ms       1.192us         24573  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      26.632ms         5.92%      26.632ms       1.090us         24431  
-void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.258ms         3.61%      16.258ms       1.323us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.582ms         3.24%      14.582ms       1.187us         12288  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.272ms         2.73%      12.272ms       1.029us         11922  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        7.526s      1690.98%        7.526s        7.526s             1  
+                                           binned_torch        24.06%        1.811s       100.00%        7.528s        7.528s       0.000us         0.00%     445.109ms     445.109ms             1  
+                                             aten::item         1.62%     121.583ms        26.84%        2.020s      14.998us       0.000us         0.00%     138.816ms       1.030us        134715  
+                              aten::_local_scalar_dense         6.12%     460.388ms        25.22%        1.899s      14.095us     138.805ms        31.19%     138.816ms       1.030us        134715  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     138.805ms        31.19%     138.805ms       1.030us        134707  
+                                     aten::floor_divide         5.25%     395.063ms        12.72%     957.555ms      19.482us      61.331ms        13.78%      61.336ms       1.248us         49152  
+                                              aten::bmm         0.00%     238.536us         0.00%     289.618us      48.270us      57.304ms        12.88%      57.304ms       9.551ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.304ms        12.88%      57.304ms       9.551ms             6  
+                                            aten::copy_         3.62%     272.274ms         8.61%     648.516ms      13.192us      53.873ms        12.10%      53.876ms       1.096us         49158  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.870ms        12.10%      53.870ms       1.096us         49149  
+                                              aten::mul         3.08%     231.551ms         5.44%     409.269ms      16.647us      51.546ms        11.58%      51.551ms       2.097us         24585  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.593ms        10.02%      44.593ms       1.814us         24576  
+                                              aten::add         4.08%     306.812ms         7.05%     530.578ms      14.594us      43.966ms         9.88%      43.969ms       1.209us         36357  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      33.107ms         7.44%      33.107ms       1.347us         24573  
+                                        aten::remainder         2.97%     223.921ms         4.70%     353.632ms      14.389us      29.770ms         6.69%      29.775ms       1.211us         24577  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.225ms         6.34%      28.225ms       1.149us         24573  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.583ms         5.75%      25.583ms       1.047us         24431  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.722ms         3.53%      15.722ms       1.279us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.047ms         3.16%      14.047ms       1.143us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.757ms         2.64%      11.757ms       0.986us         11922  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.592s
-Self CUDA time total: 449.893ms
+Self CPU time total: 7.528s
+Self CUDA time total: 445.070ms
 
 
 impl                     wl                  p50(ms)  ok
-binned_torch             cuda_B1_S1024_E2     377.89  True
-binned_torch             cuda_B1_S1024_E4     408.91  True
-binned_torch             cuda_B1_S512_E2      158.27  True
-binned_torch             cuda_B1_S512_E4      209.01  True
-binned_torch             cuda_B4_S1024_E2    1516.51  True
-binned_torch             cuda_B4_S1024_E4    1643.14  True
-binned_torch             cuda_B4_S512_E2      769.64  True
-binned_torch             cuda_B4_S512_E4      816.95  True
+binned_torch             cuda_B1_S1024_E2     367.98  True
+binned_torch             cuda_B1_S1024_E4     396.30  True
+binned_torch             cuda_B1_S512_E2      154.35  True
+binned_torch             cuda_B1_S512_E4      195.55  True
+binned_torch             cuda_B4_S1024_E2    1510.09  True
+binned_torch             cuda_B4_S1024_E4    1618.05  True
+binned_torch             cuda_B4_S512_E2      733.47  True
+binned_torch             cuda_B4_S512_E4      787.61  True
 
▶ UV Install Logs
diff --git a/openai_moe/impls/gpt_oss_moe.html b/openai_moe/impls/gpt_oss_moe.html index de225cada7a98ff364623967547575c60542959f..f452175eaca2d311b1a3a3d71d18ed3f42a42bb8 100644 --- a/openai_moe/impls/gpt_oss_moe.html +++ b/openai_moe/impls/gpt_oss_moe.html @@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.28s +Cell: nv | 0.25s | Raw @@ -3905,7 +3905,7 @@ Cell: nv | 0.28s
-
Fri Dec 19 19:41:48 2025       
+
Fri Dec 19 23:00:37 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0            126W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   40C    P0             84W /  350W |       0MiB /  46068MiB |     60%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.28s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 21.43s
+Cell: benchmark | 21.54s
  | 
 
 Raw
@@ -4042,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      10.220ms       197.88%      10.220ms      10.220ms             1  
-                                        gpt_oss_experts        16.01%       2.006ms        99.94%      12.523ms      12.523ms       0.000us         0.00%       5.168ms       5.168ms             1  
-                                           aten::matmul         0.20%      24.744us         3.78%     473.582us      39.465us       0.000us         0.00%       4.543ms     378.565us            12  
-                                               aten::mm         2.31%     289.874us         3.58%     448.838us      37.403us       4.543ms        87.96%       4.543ms     378.565us            12  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.093ms        59.88%       3.093ms     343.626us             9  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.444ms        27.95%       1.444ms     481.227us             3  
-                                              aten::mul         1.34%     167.604us         2.25%     281.908us      11.746us     108.865us         2.11%     108.865us       4.536us            24  
-                                              aten::add         1.61%     201.238us         3.79%     474.483us      26.360us     102.656us         1.99%     102.656us       5.703us            18  
-                                            aten::index         1.69%     212.259us         2.75%     345.042us      28.753us      88.512us         1.71%      88.512us       7.376us            12  
-                                       aten::index_add_         0.46%      58.122us         0.75%      94.202us      15.700us      80.160us         1.55%      80.160us      13.360us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      80.160us         1.55%      80.160us      13.360us             6  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      80.000us         1.55%      80.000us       6.667us            12  
-                                          aten::nonzero         2.08%     261.099us         6.37%     797.848us      88.650us      65.246us         1.26%      76.095us       8.455us             9  
-                                            aten::clamp         0.95%     119.641us         1.55%     194.514us      16.209us      63.010us         1.22%      63.010us       5.251us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      63.010us         1.22%      63.010us       5.251us            12  
-                                            aten::where         0.06%       7.130us         5.02%     629.533us     104.922us       0.000us         0.00%      61.472us      10.245us             6  
-                                    aten::nonzero_numpy         0.09%      11.550us         4.97%     622.403us     103.734us       0.000us         0.00%      61.472us      10.245us             6  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      60.800us         1.18%      60.800us      10.133us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      56.608us         1.10%      56.608us       4.717us            12  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      50.776us         0.98%      50.776us       1.128us            45  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      10.236ms       195.72%      10.236ms      10.236ms             1  
+                                        gpt_oss_experts        16.81%       2.119ms        99.94%      12.602ms      12.602ms       0.000us         0.00%       5.233ms       5.233ms             1  
+                                           aten::matmul         0.21%      26.351us         3.80%     479.051us      39.921us       0.000us         0.00%       4.609ms     384.095us            12  
+                                               aten::mm         2.34%     295.677us         3.59%     452.700us      37.725us       4.609ms        88.13%       4.609ms     384.095us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.132ms        59.89%       3.132ms     348.055us             9  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.470ms        28.11%       1.470ms     490.007us             3  
+                                              aten::mul         1.25%     158.075us         2.09%     263.508us      10.979us     109.535us         2.09%     109.535us       4.564us            24  
+                                              aten::add         1.50%     188.607us         3.77%     475.033us      26.391us     103.232us         1.97%     103.232us       5.735us            18  
+                                            aten::index         1.58%     199.165us         2.64%     332.439us      27.703us      88.193us         1.69%      88.193us       7.349us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      80.832us         1.55%      80.832us       6.736us            12  
+                                       aten::index_add_         0.43%      54.021us         0.70%      88.353us      14.726us      79.361us         1.52%      79.361us      13.227us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      79.361us         1.52%      79.361us      13.227us             6  
+                                          aten::nonzero         2.14%     269.616us         6.31%     796.127us      88.459us      63.904us         1.22%      74.560us       8.284us             9  
+                                            aten::clamp         0.90%     113.849us         1.52%     191.573us      15.964us      63.523us         1.21%      63.523us       5.294us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      63.523us         1.21%      63.523us       5.294us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      60.767us         1.16%      60.767us      10.128us             6  
+                                            aten::where         0.06%       7.630us         5.01%     631.874us     105.312us       0.000us         0.00%      60.384us      10.064us             6  
+                                    aten::nonzero_numpy         0.10%      12.751us         4.95%     624.244us     104.041us       0.000us         0.00%      60.384us      10.064us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      56.992us         1.09%      56.992us       4.749us            12  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      50.880us         0.97%      50.880us       1.131us            45  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 12.530ms
-Self CUDA time total: 5.165ms
+Self CPU time total: 12.609ms
+Self CUDA time total: 5.230ms
 
 
 
@@ -4074,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      14.281ms       232.73%      14.281ms      14.281ms             1  
-                                        gpt_oss_experts        16.85%       2.763ms        99.97%      16.396ms      16.396ms       0.000us         0.00%       6.139ms       6.139ms             1  
-                                           aten::matmul         0.27%      44.470us         4.93%     808.156us      33.673us       0.000us         0.00%       5.322ms     221.756us            24  
-                                               aten::mm         2.81%     461.070us         4.66%     763.686us      31.820us       5.322ms        86.73%       5.322ms     221.756us            24  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.267ms        85.83%       5.267ms     219.440us            24  
-                                          aten::nonzero         2.44%     399.465us         7.84%       1.285ms      85.683us     115.131us         1.88%     137.882us       9.192us            15  
-                                              aten::mul         1.86%     305.625us         3.19%     523.892us      10.914us     131.841us         2.15%     131.841us       2.747us            48  
-                                              aten::add         2.10%     345.215us         3.57%     585.271us      16.258us     127.810us         2.08%     127.810us       3.550us            36  
-                                            aten::where         0.07%      10.792us         7.40%       1.214ms     101.132us       0.000us         0.00%     123.674us      10.306us            12  
-                                    aten::nonzero_numpy         0.13%      21.688us         7.33%       1.203ms     100.233us       0.000us         0.00%     123.674us      10.306us            12  
-                                            aten::index         2.22%     363.289us         3.85%     631.035us      26.293us     111.423us         1.82%     111.423us       4.643us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     101.762us         1.66%     101.762us       4.240us            24  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      91.773us         1.50%      91.773us       1.055us            87  
-                                            aten::clamp         1.29%     211.324us         2.19%     359.818us      14.992us      88.222us         1.44%      88.222us       3.676us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      88.222us         1.44%      88.222us       3.676us            24  
-                                             aten::item         0.47%      77.138us        37.50%       6.150ms      85.417us       0.000us         0.00%      75.678us       1.051us            72  
-                              aten::_local_scalar_dense         1.90%     311.363us        37.03%       6.073ms      84.345us      75.678us         1.23%      75.678us       1.051us            72  
-                                       aten::index_add_         0.59%      96.073us         0.99%     162.304us      13.525us      70.526us         1.15%      70.526us       5.877us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      70.526us         1.15%      70.526us       5.877us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      66.368us         1.08%      66.368us       5.531us            12  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      14.418ms       232.27%      14.418ms      14.418ms             1  
+                                        gpt_oss_experts        16.77%       2.777ms        99.97%      16.548ms      16.548ms       0.000us         0.00%       6.210ms       6.210ms             1  
+                                           aten::matmul         0.29%      47.549us         4.87%     805.573us      33.566us       0.000us         0.00%       5.399ms     224.951us            24  
+                                               aten::mm         2.86%     473.570us         4.58%     758.024us      31.584us       5.399ms        86.98%       5.399ms     224.951us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.343ms        86.07%       5.343ms     222.609us            24  
+                                          aten::nonzero         2.46%     406.423us         7.79%       1.290ms      85.983us     112.737us         1.82%     135.233us       9.016us            15  
+                                              aten::mul         1.91%     315.499us         3.27%     541.644us      11.284us     131.458us         2.12%     131.458us       2.739us            48  
+                                              aten::add         2.09%     345.610us         3.58%     592.305us      16.453us     127.137us         2.05%     127.137us       3.532us            36  
+                                            aten::where         0.07%      11.421us         7.35%       1.217ms     101.380us       0.000us         0.00%     121.345us      10.112us            12  
+                                    aten::nonzero_numpy         0.14%      22.419us         7.28%       1.205ms     100.429us       0.000us         0.00%     121.345us      10.112us            12  
+                                            aten::index         2.37%     392.707us         3.98%     658.793us      27.450us     110.560us         1.78%     110.560us       4.607us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     101.089us         1.63%     101.089us       4.212us            24  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      91.523us         1.47%      91.523us       1.052us            87  
+                                            aten::clamp         1.31%     216.727us         2.19%     362.649us      15.110us      87.299us         1.41%      87.299us       3.637us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      87.299us         1.41%      87.299us       3.637us            24  
+                                             aten::item         0.49%      80.385us        37.67%       6.235ms      86.604us       0.000us         0.00%      75.204us       1.044us            72  
+                              aten::_local_scalar_dense         1.99%     329.728us        37.18%       6.155ms      85.487us      75.204us         1.21%      75.204us       1.044us            72  
+                                       aten::index_add_         0.56%      93.084us         0.97%     160.623us      13.385us      71.618us         1.15%      71.618us       5.968us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      71.618us         1.15%      71.618us       5.968us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      66.656us         1.07%      66.656us       5.555us            12  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 16.401ms
-Self CUDA time total: 6.136ms
+Self CPU time total: 16.554ms
+Self CUDA time total: 6.207ms
 
 
 
@@ -4106,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      12.623ms       150.27%      12.623ms      12.623ms             1  
-                                        gpt_oss_experts        13.47%       1.791ms        99.96%      13.283ms      13.283ms       0.000us         0.00%       8.405ms       8.405ms             1  
-                                           aten::matmul         0.18%      23.339us         3.36%     446.659us      37.222us       0.000us         0.00%       7.382ms     615.173us            12  
-                                               aten::mm         1.99%     264.803us         3.19%     423.320us      35.277us       7.382ms        87.88%       7.382ms     615.173us            12  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       4.494ms        53.50%       4.494ms     748.960us             6  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       1.479ms        17.61%       1.479ms     493.131us             3  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.402ms        16.69%       1.402ms     467.413us             3  
-                                              aten::mul         1.17%     155.791us         2.03%     269.215us      11.217us     193.439us         2.30%     193.439us       8.060us            24  
-                                              aten::add         1.34%     178.665us         2.34%     311.318us      17.295us     184.286us         2.19%     184.286us      10.238us            18  
-                                       aten::index_add_         0.37%      48.760us         0.64%      85.661us      14.277us     167.358us         1.99%     167.358us      27.893us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     167.358us         1.99%     167.358us      27.893us             6  
-                                            aten::index         1.43%     189.705us         2.42%     321.187us      26.766us     146.945us         1.75%     146.945us      12.245us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.824us         1.74%     145.824us      12.152us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     116.832us         1.39%     116.832us      19.472us             6  
-                                            aten::clamp         0.82%     108.995us         1.40%     185.495us      15.458us     109.284us         1.30%     109.284us       9.107us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     109.284us         1.30%     109.284us       9.107us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     103.135us         1.23%     103.135us       8.595us            12  
-                                          aten::nonzero         1.83%     243.374us         5.76%     765.236us      85.026us      70.402us         0.84%      81.794us       9.088us             9  
-                                            aten::where         0.04%       5.651us         4.63%     615.153us     102.525us       0.000us         0.00%      66.851us      11.142us             6  
-                                    aten::nonzero_numpy         0.08%      11.009us         4.59%     609.502us     101.584us       0.000us         0.00%      66.851us      11.142us             6  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      12.715ms       148.98%      12.715ms      12.715ms             1  
+                                        gpt_oss_experts        13.24%       1.769ms        99.96%      13.348ms      13.348ms       0.000us         0.00%       8.540ms       8.540ms             1  
+                                           aten::matmul         0.18%      23.619us         3.35%     447.210us      37.267us       0.000us         0.00%       7.511ms     625.895us            12  
+                                               aten::mm         1.99%     265.185us         3.17%     423.591us      35.299us       7.511ms        88.01%       7.511ms     625.895us            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       4.572ms        53.58%       4.572ms     762.082us             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       1.497ms        17.54%       1.497ms     498.892us             3  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.435ms        16.81%       1.435ms     478.305us             3  
+                                              aten::mul         1.21%     162.011us         2.06%     274.994us      11.458us     197.600us         2.32%     197.600us       8.233us            24  
+                                              aten::add         1.32%     176.183us         2.25%     300.545us      16.697us     188.546us         2.21%     188.546us      10.475us            18  
+                                       aten::index_add_         0.35%      46.949us         0.64%      86.050us      14.342us     164.416us         1.93%     164.416us      27.403us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     164.416us         1.93%     164.416us      27.403us             6  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     149.442us         1.75%     149.442us      12.453us            12  
+                                            aten::index         1.39%     185.093us         2.39%     318.747us      26.562us     146.144us         1.71%     146.144us      12.179us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     116.287us         1.36%     116.287us      19.381us             6  
+                                            aten::clamp         0.82%     108.858us         1.40%     187.503us      15.625us     110.850us         1.30%     110.850us       9.238us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     110.850us         1.30%     110.850us       9.238us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.960us         1.23%     104.960us       8.747us            12  
+                                          aten::nonzero         1.82%     243.314us         5.65%     754.927us      83.881us      69.183us         0.81%      80.703us       8.967us             9  
+                                            aten::where         0.04%       5.842us         4.63%     617.944us     102.991us       0.000us         0.00%      66.080us      11.013us             6  
+                                    aten::nonzero_numpy         0.08%      11.348us         4.58%     612.102us     102.017us       0.000us         0.00%      66.080us      11.013us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 13.289ms
-Self CUDA time total: 8.400ms
+Self CPU time total: 13.354ms
+Self CUDA time total: 8.534ms
 
 
 
@@ -4138,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      18.138ms       172.84%      18.138ms      18.138ms             1  
-                                        gpt_oss_experts        12.76%       2.622ms        99.97%      20.540ms      20.540ms       0.000us         0.00%      10.500ms      10.500ms             1  
-                                           aten::matmul         0.22%      44.749us         4.11%     844.232us      35.176us       0.000us         0.00%       9.224ms     384.346us            24  
-                                               aten::mm         2.32%     476.088us         3.89%     799.483us      33.312us       9.224ms        87.90%       9.224ms     384.346us            24  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       6.287ms        59.90%       6.287ms     349.259us            18  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.925ms        27.87%       2.925ms     487.438us             6  
-                                              aten::mul         1.51%     311.093us         2.62%     538.833us      11.226us     229.793us         2.19%     229.793us       4.787us            48  
-                                              aten::add         1.68%     344.530us         2.88%     592.257us      16.452us     211.009us         2.01%     211.009us       5.861us            36  
-                                            aten::index         1.75%     359.041us         3.02%     619.685us      25.820us     205.054us         1.95%     205.054us       8.544us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     164.639us         1.57%     164.639us       6.860us            24  
-                                       aten::index_add_         0.48%      97.780us         0.85%     174.953us      14.579us     157.631us         1.50%     157.631us      13.136us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     157.631us         1.50%     157.631us      13.136us            12  
-                                          aten::nonzero         1.89%     388.553us         6.17%       1.268ms      84.506us     122.654us         1.17%     146.847us       9.790us            15  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     145.663us         1.39%     145.663us      12.139us            12  
-                                            aten::where         0.05%      10.471us         5.79%       1.190ms      99.134us       0.000us         0.00%     132.128us      11.011us            12  
-                                    aten::nonzero_numpy         0.10%      21.340us         5.74%       1.179ms      98.262us       0.000us         0.00%     132.128us      11.011us            12  
-                                            aten::clamp         1.02%     209.010us         1.74%     358.311us      14.930us     131.327us         1.25%     131.327us       5.472us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     131.327us         1.25%     131.327us       5.472us            24  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     117.601us         1.12%     117.601us       4.900us            24  
-                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     108.253us         1.03%     108.253us       1.244us            87  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      18.371ms       173.06%      18.371ms      18.371ms             1  
+                                        gpt_oss_experts        12.78%       2.670ms        99.97%      20.895ms      20.895ms       0.000us         0.00%      10.621ms      10.621ms             1  
+                                           aten::matmul         0.23%      47.482us         3.94%     823.658us      34.319us       0.000us         0.00%       9.337ms     389.038us            24  
+                                               aten::mm         2.27%     474.301us         3.71%     776.176us      32.341us       9.337ms        87.96%       9.337ms     389.038us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       6.375ms        60.06%       6.375ms     354.186us            18  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.948ms        27.77%       2.948ms     491.399us             6  
+                                              aten::mul         1.63%     341.535us         2.71%     565.653us      11.784us     233.052us         2.20%     233.052us       4.855us            48  
+                                              aten::add         1.65%     343.966us         2.82%     589.773us      16.383us     214.333us         2.02%     214.333us       5.954us            36  
+                                            aten::index         1.71%     356.851us         2.95%     617.053us      25.711us     204.352us         1.93%     204.352us       8.515us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     167.774us         1.58%     167.774us       6.991us            24  
+                                       aten::index_add_         0.45%      94.502us         0.77%     161.933us      13.494us     156.322us         1.47%     156.322us      13.027us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     156.322us         1.47%     156.322us      13.027us            12  
+                                          aten::nonzero         1.91%     398.170us         6.16%       1.287ms      85.805us     122.527us         1.15%     147.135us       9.809us            15  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     146.240us         1.38%     146.240us      12.187us            12  
+                                            aten::clamp         1.04%     217.693us         1.76%     368.516us      15.355us     133.438us         1.26%     133.438us       5.560us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.438us         1.26%     133.438us       5.560us            24  
+                                            aten::where         0.05%      11.100us         5.81%       1.214ms     101.204us       0.000us         0.00%     132.577us      11.048us            12  
+                                    aten::nonzero_numpy         0.10%      21.341us         5.76%       1.203ms     100.279us       0.000us         0.00%     132.577us      11.048us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     119.358us         1.12%     119.358us       4.973us            24  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     108.671us         1.02%     108.671us       1.249us            87  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 20.546ms
-Self CUDA time total: 10.495ms
+Self CPU time total: 20.901ms
+Self CUDA time total: 10.615ms
 
 
 
@@ -4170,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      20.935ms       121.00%      20.935ms      20.935ms             1  
-                                        gpt_oss_experts         7.61%       1.780ms        99.98%      23.376ms      23.376ms       0.000us         0.00%      17.312ms      17.312ms             1  
-                                           aten::matmul         0.10%      23.122us         1.96%     458.772us      38.231us       0.000us         0.00%      14.468ms       1.206ms            12  
-                                               aten::mm         1.15%     269.268us         1.86%     435.650us      36.304us      14.468ms        83.62%      14.468ms       1.206ms            12  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       8.827ms        51.02%       8.827ms       1.471ms             6  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.632ms        32.55%       5.632ms     938.689us             6  
-                                              aten::add         0.79%     184.599us         1.36%     318.590us      17.699us     771.593us         4.46%     771.593us      42.866us            18  
-                                              aten::mul         0.68%     158.205us         1.17%     272.787us      11.366us     648.706us         3.75%     648.706us      27.029us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     492.134us         2.84%     492.134us      41.011us            12  
-                                       aten::index_add_         0.22%      51.621us         0.39%      91.292us      15.215us     449.187us         2.60%     449.187us      74.864us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     449.187us         2.60%     449.187us      74.864us             6  
-                                            aten::clamp         0.47%     109.062us         0.80%     186.384us      15.532us     328.069us         1.90%     328.069us      27.339us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     328.069us         1.90%     328.069us      27.339us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     298.432us         1.72%     298.432us      49.739us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     279.459us         1.62%     279.459us      46.576us             6  
-                                            aten::index         0.79%     185.644us         1.37%     320.365us      26.697us     259.362us         1.50%     259.362us      21.614us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     252.002us         1.46%     252.002us      21.000us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     226.817us         1.31%     226.817us      37.803us             6  
-                                          aten::sigmoid         0.16%      37.651us         0.31%      72.093us      12.016us     177.249us         1.02%     177.249us      29.542us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     177.249us         1.02%     177.249us      29.542us             6  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      21.147ms       120.13%      21.147ms      21.147ms             1  
+                                        gpt_oss_experts         7.48%       1.759ms        99.98%      23.501ms      23.501ms       0.000us         0.00%      17.613ms      17.613ms             1  
+                                           aten::matmul         0.10%      24.413us         1.93%     452.632us      37.719us       0.000us         0.00%      14.754ms       1.229ms            12  
+                                               aten::mm         1.14%     267.578us         1.82%     428.219us      35.685us      14.754ms        83.81%      14.754ms       1.229ms            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.005ms        51.15%       9.005ms       1.501ms             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.740ms        32.61%       5.740ms     956.646us             6  
+                                              aten::add         0.80%     187.171us         1.34%     315.717us      17.540us     774.145us         4.40%     774.145us      43.008us            18  
+                                              aten::mul         0.68%     160.882us         1.16%     272.615us      11.359us     660.967us         3.75%     660.967us      27.540us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     494.017us         2.81%     494.017us      41.168us            12  
+                                       aten::index_add_         0.20%      46.930us         0.35%      82.651us      13.775us     446.818us         2.54%     446.818us      74.470us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     446.818us         2.54%     446.818us      74.470us             6  
+                                            aten::clamp         0.49%     114.212us         0.82%     193.704us      16.142us     330.081us         1.88%     330.081us      27.507us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     330.081us         1.88%     330.081us      27.507us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     303.524us         1.72%     303.524us      50.587us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     280.128us         1.59%     280.128us      46.688us             6  
+                                            aten::index         0.79%     185.142us         1.34%     314.927us      26.244us     260.002us         1.48%     260.002us      21.667us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     255.139us         1.45%     255.139us      21.262us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     227.361us         1.29%     227.361us      37.894us             6  
+                                          aten::sigmoid         0.17%      39.139us         0.29%      67.081us      11.180us     175.681us         1.00%     175.681us      29.280us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     175.681us         1.00%     175.681us      29.280us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 23.381ms
-Self CUDA time total: 17.302ms
+Self CPU time total: 23.507ms
+Self CUDA time total: 17.603ms
 
 
 
@@ -4202,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      24.710ms       141.76%      24.710ms      24.710ms             1  
-                                        gpt_oss_experts        10.14%       2.749ms        99.98%      27.106ms      27.106ms       0.000us         0.00%      17.441ms      17.441ms             1  
-                                           aten::matmul         0.17%      45.968us         3.40%     922.464us      38.436us       0.000us         0.00%      15.230ms     634.586us            24  
-                                               aten::mm         2.05%     556.479us         3.23%     876.496us      36.521us      15.230ms        87.37%      15.230ms     634.586us            24  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.172ms        52.62%       9.172ms     764.334us            12  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.147ms        18.05%       3.147ms     524.452us             6  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.898ms        16.62%       2.898ms     482.943us             6  
-                                              aten::add         1.29%     350.116us         2.26%     613.465us      17.041us     420.321us         2.41%     420.321us      11.676us            36  
-                                              aten::mul         1.13%     307.419us         1.97%     533.015us      11.104us     413.571us         2.37%     413.571us       8.616us            48  
-                                       aten::index_add_         0.36%      98.853us         0.63%     169.455us      14.121us     380.323us         2.18%     380.323us      31.694us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     380.323us         2.18%     380.323us      31.694us            12  
-                                            aten::index         1.34%     364.187us         2.36%     638.760us      26.615us     342.626us         1.97%     342.626us      14.276us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     337.185us         1.93%     337.185us      14.049us            24  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     278.754us         1.60%     278.754us      23.230us            12  
-                                            aten::clamp         0.81%     219.710us         1.37%     372.721us      15.530us     226.367us         1.30%     226.367us       9.432us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     226.367us         1.30%     226.367us       9.432us            24  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     219.298us         1.26%     219.298us       9.137us            24  
-                                          aten::nonzero         1.48%     402.204us         4.91%       1.331ms      88.732us     129.571us         0.74%     155.747us      10.383us            15  
-                                            aten::where         0.04%      10.572us         4.67%       1.267ms     105.600us       0.000us         0.00%     139.970us      11.664us            12  
-                                    aten::nonzero_numpy         0.08%      21.969us         4.64%       1.257ms     104.719us       0.000us         0.00%     139.970us      11.664us            12  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      24.812ms       140.52%      24.812ms      24.812ms             1  
+                                        gpt_oss_experts        10.20%       2.768ms        99.98%      27.139ms      27.139ms       0.000us         0.00%      17.668ms      17.668ms             1  
+                                           aten::matmul         0.17%      47.070us         3.25%     881.530us      36.730us       0.000us         0.00%      15.436ms     643.168us            24  
+                                               aten::mm         1.94%     525.958us         3.07%     834.460us      34.769us      15.436ms        87.42%      15.436ms     643.168us            24  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.298ms        52.66%       9.298ms     774.816us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.185ms        18.04%       3.185ms     530.803us             6  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.939ms        16.65%       2.939ms     489.897us             6  
+                                              aten::add         1.32%     358.751us         2.25%     610.989us      16.972us     429.537us         2.43%     429.537us      11.932us            36  
+                                              aten::mul         1.17%     318.045us         2.01%     546.157us      11.378us     419.555us         2.38%     419.555us       8.741us            48  
+                                       aten::index_add_         0.35%      93.791us         0.61%     165.384us      13.782us     375.712us         2.13%     375.712us      31.309us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     375.712us         2.13%     375.712us      31.309us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     344.672us         1.95%     344.672us      14.361us            24  
+                                            aten::index         1.36%     368.555us         2.35%     637.581us      26.566us     343.779us         1.95%     343.779us      14.324us            24  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     279.524us         1.58%     279.524us      23.294us            12  
+                                            aten::clamp         0.81%     220.839us         1.38%     373.627us      15.568us     232.100us         1.31%     232.100us       9.671us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.100us         1.31%     232.100us       9.671us            24  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     222.273us         1.26%     222.273us       9.261us            24  
+                                          aten::nonzero         1.49%     404.133us         4.81%       1.304ms      86.953us     129.285us         0.73%     155.591us      10.373us            15  
+                                            aten::where         0.04%      11.801us         4.54%       1.232ms     102.652us       0.000us         0.00%     140.134us      11.678us            12  
+                                    aten::nonzero_numpy         0.08%      22.919us         4.49%       1.220ms     101.669us       0.000us         0.00%     140.134us      11.678us            12  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 27.112ms
-Self CUDA time total: 17.431ms
+Self CPU time total: 27.144ms
+Self CUDA time total: 17.658ms
 
 
 
@@ -4234,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      40.438ms       109.96%      40.438ms      40.438ms             1  
-                                        gpt_oss_experts         4.40%       1.882ms        99.82%      42.728ms      42.728ms       0.000us         0.00%      36.808ms      36.808ms             1  
-                                           aten::matmul         0.05%      22.249us         1.02%     438.421us      36.535us       0.000us         0.00%      26.813ms       2.234ms            12  
-                                               aten::mm         0.66%     281.965us         0.97%     416.172us      34.681us      26.813ms        72.91%      26.813ms       2.234ms            12  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      26.809ms        72.90%      26.809ms       2.234ms            12  
-                                              aten::mul         0.40%     169.436us         0.68%     291.368us      12.140us       2.973ms         8.09%       2.973ms     123.894us            24  
-                                              aten::add         0.45%     194.095us         1.09%     466.694us      25.927us       2.399ms         6.52%       2.399ms     133.270us            18  
-                                            aten::clamp         0.28%     118.373us         0.48%     205.484us      17.124us       2.385ms         6.49%       2.385ms     198.780us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.385ms         6.49%       2.385ms     198.780us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.983ms         5.39%       1.983ms     165.284us            12  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.623ms         4.41%       1.623ms     135.241us            12  
-                                       aten::index_add_         0.12%      50.121us         0.21%      88.453us      14.742us     929.513us         2.53%     929.513us     154.919us             6  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     929.513us         2.53%     929.513us     154.919us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     775.973us         2.11%     775.973us     129.329us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     743.622us         2.02%     743.622us     123.937us             6  
-                                            aten::index         0.44%     190.163us         0.78%     332.417us      27.701us     705.798us         1.92%     705.798us      58.816us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     672.133us         1.83%     672.133us     112.022us             6  
-                                          aten::sigmoid         0.10%      42.342us         0.17%      71.992us      11.999us     317.635us         0.86%     317.635us      52.939us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     317.635us         0.86%     317.635us      52.939us             6  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     246.434us         0.67%     246.434us      41.072us             6  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      40.716ms       109.44%      40.716ms      40.716ms             1  
+                                        gpt_oss_experts         4.15%       1.782ms        99.82%      42.848ms      42.848ms       0.000us         0.00%      37.235ms      37.235ms             1  
+                                           aten::matmul         0.05%      22.008us         1.00%     427.588us      35.632us       0.000us         0.00%      27.249ms       2.271ms            12  
+                                               aten::mm         0.64%     276.436us         0.94%     405.580us      33.798us      27.249ms        73.24%      27.249ms       2.271ms            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      27.245ms        73.23%      27.245ms       2.270ms            12  
+                                              aten::mul         0.38%     162.893us         0.65%     277.866us      11.578us       2.967ms         7.97%       2.967ms     123.619us            24  
+                                              aten::add         0.45%     194.205us         1.07%     458.802us      25.489us       2.398ms         6.45%       2.398ms     133.242us            18  
+                                            aten::clamp         0.26%     112.402us         0.45%     191.453us      15.954us       2.384ms         6.41%       2.384ms     198.708us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.384ms         6.41%       2.384ms     198.708us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.985ms         5.34%       1.985ms     165.412us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.626ms         4.37%       1.626ms     135.484us            12  
+                                       aten::index_add_         0.11%      46.550us         0.19%      83.331us      13.889us     923.493us         2.48%     923.493us     153.916us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     923.493us         2.48%     923.493us     153.916us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     772.550us         2.08%     772.550us     128.758us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     736.421us         1.98%     736.421us     122.737us             6  
+                                            aten::index         0.43%     184.050us         0.73%     314.765us      26.230us     705.700us         1.90%     705.700us      58.808us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     672.068us         1.81%     672.068us     112.011us             6  
+                                          aten::sigmoid         0.09%      40.702us         0.16%      68.501us      11.417us     324.705us         0.87%     324.705us      54.117us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     324.705us         0.87%     324.705us      54.117us             6  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     245.504us         0.66%     245.504us      40.917us             6  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 42.805ms
-Self CUDA time total: 36.776ms
+Self CPU time total: 42.926ms
+Self CUDA time total: 37.203ms
 
 
 
@@ -4266,40 +4266,40 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      40.917ms       118.34%      40.917ms      40.917ms             1  
-                                        gpt_oss_experts         6.54%       2.832ms        99.99%      43.320ms      43.320ms       0.000us         0.00%      34.594ms      34.594ms             1  
-                                           aten::matmul         0.11%      46.003us         2.16%     933.683us      38.903us       0.000us         0.00%      28.640ms       1.193ms            24  
-                                               aten::mm         1.27%     551.595us         2.05%     887.680us      36.987us      28.640ms        82.83%      28.640ms       1.193ms            24  
-void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      20.238ms        58.53%      20.238ms       1.349ms            15  
-                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       8.385ms        24.25%       8.385ms     931.701us             9  
-                                              aten::add         0.85%     367.713us         1.47%     637.625us      17.712us       1.485ms         4.30%       1.485ms      41.254us            36  
-                                              aten::mul         0.73%     317.651us         1.28%     554.606us      11.554us       1.368ms         3.96%       1.368ms      28.495us            48  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     932.164us         2.70%     932.164us      38.840us            24  
-                                       aten::index_add_         0.23%      99.030us         0.39%     170.492us      14.208us     912.225us         2.64%     912.225us      76.019us            12  
-void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     912.225us         2.64%     912.225us      76.019us            12  
-                                            aten::clamp         0.52%     223.402us         0.90%     389.994us      16.250us     772.775us         2.24%     772.775us      32.199us            24  
-void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     772.775us         2.24%     772.775us      32.199us            24  
-                                            aten::index         0.84%     365.911us         1.48%     641.837us      26.743us     652.128us         1.89%     652.128us      27.172us            24  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     646.273us         1.87%     646.273us      53.856us            12  
-void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     582.113us         1.68%     582.113us      48.509us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     552.993us         1.60%     552.993us      46.083us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     519.810us         1.50%     519.810us      21.659us            24  
-                                          aten::sigmoid         0.18%      79.593us         0.31%     135.883us      11.324us     361.471us         1.05%     361.471us      30.123us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     361.471us         1.05%     361.471us      30.123us            12  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      41.326ms       117.97%      41.326ms      41.326ms             1  
+                                        gpt_oss_experts         6.48%       2.843ms        99.99%      43.865ms      43.865ms       0.000us         0.00%      35.050ms      35.050ms             1  
+                                           aten::matmul         0.11%      47.091us         2.05%     900.896us      37.537us       0.000us         0.00%      29.086ms       1.212ms            24  
+                                               aten::mm         1.22%     537.124us         1.95%     853.805us      35.575us      29.086ms        83.03%      29.086ms       1.212ms            24  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      20.524ms        58.59%      20.524ms       1.368ms            15  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       8.546ms        24.39%       8.546ms     949.503us             9  
+                                              aten::add         0.83%     362.842us         1.41%     616.516us      17.125us       1.481ms         4.23%       1.481ms      41.132us            36  
+                                              aten::mul         0.72%     316.599us         1.22%     535.905us      11.165us       1.379ms         3.94%       1.379ms      28.736us            48  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     928.582us         2.65%     928.582us      38.691us            24  
+                                       aten::index_add_         0.22%      95.553us         0.38%     168.433us      14.036us     914.346us         2.61%     914.346us      76.195us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     914.346us         2.61%     914.346us      76.195us            12  
+                                            aten::clamp         0.51%     224.207us         0.87%     380.890us      15.870us     772.996us         2.21%     772.996us      32.208us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     772.996us         2.21%     772.996us      32.208us            24  
+                                            aten::index         0.86%     378.436us         1.47%     642.801us      26.783us     657.670us         1.88%     657.670us      27.403us            24  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     653.092us         1.86%     653.092us      54.424us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     586.630us         1.67%     586.630us      48.886us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     552.162us         1.58%     552.162us      46.014us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     522.306us         1.49%     522.306us      21.763us            24  
+                                          aten::sigmoid         0.20%      86.392us         0.33%     145.153us      12.096us     354.306us         1.01%     354.306us      29.525us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     354.306us         1.01%     354.306us      29.525us            12  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 43.326ms
-Self CUDA time total: 34.575ms
+Self CPU time total: 43.870ms
+Self CUDA time total: 35.030ms
 
 
 impl                     wl                  p50(ms)  ok
-gpt_oss_experts          cuda_B1_S1024_E2       3.85  True
-gpt_oss_experts          cuda_B1_S1024_E4       5.31  True
-gpt_oss_experts          cuda_B1_S512_E2        2.63  True
-gpt_oss_experts          cuda_B1_S512_E4        3.93  True
-gpt_oss_experts          cuda_B4_S1024_E2      13.24  True
-gpt_oss_experts          cuda_B4_S1024_E4      13.36  True
-gpt_oss_experts          cuda_B4_S512_E2        6.72  True
-gpt_oss_experts          cuda_B4_S512_E4        7.52  True
+gpt_oss_experts          cuda_B1_S1024_E2       3.87  True
+gpt_oss_experts          cuda_B1_S1024_E4       5.34  True
+gpt_oss_experts          cuda_B1_S512_E2        2.66  True
+gpt_oss_experts          cuda_B1_S512_E4        3.95  True
+gpt_oss_experts          cuda_B4_S1024_E2      13.39  True
+gpt_oss_experts          cuda_B4_S1024_E4      13.41  True
+gpt_oss_experts          cuda_B4_S512_E2        6.80  True
+gpt_oss_experts          cuda_B4_S512_E4        7.53  True
 
▶ UV Install Logs
@@ -4308,12 +4308,12 @@ gpt_oss_experts cuda_B4_S512_E4 7.52 True Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c) Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c -Installed 14 packages in 3ms +Installed 14 packages in 4ms
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] -Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 6.07it/s] -Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 12.14it/s]
+Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 5.85it/s] +Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 11.70it/s]

Artifacts:

openai_moe.jsonl diff --git a/openai_moe/results/artifacts/combine/latency.svg b/openai_moe/results/artifacts/combine/latency.svg index 844816c8fa2d0ddbedc947c671ec11193a9ee2de..2df8cba08a055f61d0f2e1590c4a4110a6d62d74 100644 --- a/openai_moe/results/artifacts/combine/latency.svg +++ b/openai_moe/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3090485b23d0740dc54ec975ab4d53494c6243ac5b87df898966ffdc9bc67256 -size 20315 +oid sha256:02555fc2deb5b3ebf32c8e5fb2da2aa4a52c2220d481a1dd86dbee6b6edcbda6 +size 21867 diff --git a/openai_moe/results/combined_results.html b/openai_moe/results/combined_results.html index 91e18d57c1c70be55a0b5e7b775136eeab242e5e..45e6ffa799b9e58ab3b1842e4639066e2b9cdefa 100644 --- a/openai_moe/results/combined_results.html +++ b/openai_moe/results/combined_results.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { - 2025-12-19T19:55:39.293722 + 2025-12-19T23:02:40.893386 image/svg+xml @@ -3908,294 +3908,320 @@ body[data-tool="eraser"] .main-content { - + - + - + - cuda_B1_S512_E2 + cuda_B1_S512_E2 - + - + - cuda_B1_S512_E4 + cuda_B1_S512_E4 - + - + - cuda_B1_S1024_E2 + cuda_B1_S1024_E2 - + - + - cuda_B1_S1024_E4 + cuda_B1_S1024_E4 - + - + - cuda_B4_S512_E2 + cuda_B4_S512_E2 - + - + - cuda_B4_S512_E4 + cuda_B4_S512_E4 - + - + - cuda_B4_S1024_E2 + cuda_B4_S1024_E2 - + - + - cuda_B4_S1024_E4 + cuda_B4_S1024_E4 - Workload + Workload - + - + - 0 + 0 - + - + - 250 + 200 - + - + - 500 + 400 - + - + - 750 + 600 - + - + - 1000 + 800 - + - + - 1250 + 1000 - + - + - 1500 + 1200 + + + + + + + + + + + + + 1400 + + + + + + + + + + + + + 1600 - Latency P50 (ms) + Latency P50 (ms) - + - - - - - - - - - + + + + + + + + + - + - - - - - - - - - + + + + + + + + + - + - + - + - - Attention Implementation Latency + + Attention Implementation Latency - + - - + + - + - binned_torch + binned_torch - - + + - + - gpt_oss_experts + gpt_oss_experts - - + + @@ -4208,7 +4234,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.43s +Cell: combine | 4.45s | Raw @@ -4297,22 +4323,22 @@ Summary: 2 found, 0 skipped, 0 missing COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok -binned_torch cuda_B1_S1024_E2 377.89 True -binned_torch cuda_B1_S1024_E4 408.91 True -binned_torch cuda_B1_S512_E2 158.27 True -binned_torch cuda_B1_S512_E4 209.01 True -binned_torch cuda_B4_S1024_E2 1516.51 True -binned_torch cuda_B4_S1024_E4 1643.14 True -binned_torch cuda_B4_S512_E2 769.64 True -binned_torch cuda_B4_S512_E4 816.95 True -gpt_oss_experts cuda_B1_S1024_E2 3.85 True -gpt_oss_experts cuda_B1_S1024_E4 5.31 True -gpt_oss_experts cuda_B1_S512_E2 2.63 True -gpt_oss_experts cuda_B1_S512_E4 3.93 True -gpt_oss_experts cuda_B4_S1024_E2 13.24 True -gpt_oss_experts cuda_B4_S1024_E4 13.36 True -gpt_oss_experts cuda_B4_S512_E2 6.72 True -gpt_oss_experts cuda_B4_S512_E4 7.52 True +binned_torch cuda_B1_S1024_E2 367.98 True +binned_torch cuda_B1_S1024_E4 396.30 True +binned_torch cuda_B1_S512_E2 154.35 True +binned_torch cuda_B1_S512_E4 195.55 True +binned_torch cuda_B4_S1024_E2 1510.09 True +binned_torch cuda_B4_S1024_E4 1618.05 True +binned_torch cuda_B4_S512_E2 733.47 True +binned_torch cuda_B4_S512_E4 787.61 True +gpt_oss_experts cuda_B1_S1024_E2 3.87 True +gpt_oss_experts cuda_B1_S1024_E4 5.34 True +gpt_oss_experts cuda_B1_S512_E2 2.66 True +gpt_oss_experts cuda_B1_S512_E4 3.95 True +gpt_oss_experts cuda_B4_S1024_E2 13.39 True +gpt_oss_experts cuda_B4_S1024_E4 13.41 True +gpt_oss_experts cuda_B4_S512_E2 6.80 True +gpt_oss_experts cuda_B4_S512_E4 7.53 True GENERATING COMBINED VISUALIZATION @@ -4332,7 +4358,7 @@ Implementations included:
▶ UV Install Logs
@@ -4345,7 +4371,7 @@ Installed 37 packages in 205ms - 2025-12-19T19:55:39.293722 + 2025-12-19T23:02:40.893386 image/svg+xml @@ -4364,294 +4390,320 @@ Installed 37 packages in 205ms - + - + - + - cuda_B1_S512_E2 + cuda_B1_S512_E2 - + - + - cuda_B1_S512_E4 + cuda_B1_S512_E4 - + - + - cuda_B1_S1024_E2 + cuda_B1_S1024_E2 - + - + - cuda_B1_S1024_E4 + cuda_B1_S1024_E4 - + - + - cuda_B4_S512_E2 + cuda_B4_S512_E2 - + - + - cuda_B4_S512_E4 + cuda_B4_S512_E4 - + - + - cuda_B4_S1024_E2 + cuda_B4_S1024_E2 - + - + - cuda_B4_S1024_E4 + cuda_B4_S1024_E4 - Workload + Workload - + - + - 0 + 0 - + - + - 250 + 200 - + - + - 500 + 400 - + - + - 750 + 600 - + - + - 1000 + 800 - + - + - 1250 + 1000 - + - + - 1500 + 1200 + + + + + + + + + + + + + 1400 + + + + + + + + + + + + + 1600 - Latency P50 (ms) + Latency P50 (ms) - + - - - - - - - - - + + + + + + + + + - + - - - - - - - - - + + + + + + + + + - + - + - + - - Attention Implementation Latency + + Attention Implementation Latency - + - - + + - + - binned_torch + binned_torch - - + + - + - gpt_oss_experts + gpt_oss_experts - - + + diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl index 5f35e45084ea719a3ca2aeb5c55449feae48605d..7f067ba65986e0c388d4d39ef672b97d0dfd7f63 100644 --- a/rotary/impls/artifacts/benchmark/rotary.jsonl +++ b/rotary/impls/artifacts/benchmark/rotary.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07445200003530772, "p50": 0.07589100005134242, "p90": 0.07600200001434132, "mean": 0.0754678000248532, "iqr": 0.0014600000213249587, "raw_times": [0.0764520000302582, 0.07600200001434132, 0.07589100005134242, 0.07454199999301636, 0.07445200003530772], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08018199991965957, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898300006876525, "p50": 0.09174199999506527, "p90": 0.09300300007453188, "mean": 0.09168480000880663, "iqr": 0.0013200001376389991, "raw_times": [0.09168299993689288, 0.09300300007453188, 0.09174199999506527, 0.09301299996877788, 0.08898300006876525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09485199984737847, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08705300001565774, "p50": 0.09316300020145718, "p90": 0.10223200001746591, "mean": 0.09793460003493237, "iqr": 0.013889999991079094, "raw_times": [0.10223200001746591, 0.09316300020145718, 0.11888299991369422, 0.08834200002638681, 0.08705300001565774], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.09152200004791666, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0873520000368444, "p50": 0.08926300006351084, "p90": 0.08946200000536919, "mean": 0.08885220004231087, "iqr": 0.0013999999737279722, "raw_times": [0.0873520000368444, 0.09012200007418869, 0.08926300006351084, 0.08806200003164122, 0.08946200000536919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09113200007959676, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08857299985720601, "p50": 0.09020299989970226, "p90": 0.09035299990500789, "mean": 0.0900987999557401, "iqr": 0.00085999977272877, "raw_times": [0.08949300013227912, 0.09020299989970226, 0.09035299990500789, 0.08857299985720601, 0.09187199998450524], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09262200001103338, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08818200012683519, "p50": 0.08903200000531797, "p90": 0.08924200005822058, "mean": 0.08891200000107347, "iqr": 0.0008400002116104588, "raw_times": [0.08903200000531797, 0.08840199984661012, 0.08818200012683519, 0.08924200005822058, 0.08970199996838346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09153199994216266, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08838300004754274, "p50": 0.08999199985737505, "p90": 0.0905619999684859, "mean": 0.09254639999198844, "iqr": 0.0011899999208253575, "raw_times": [0.0905619999684859, 0.10442300003887794, 0.08937200004766055, 0.08999199985737505, 0.08838300004754274], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10543200005486142, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08926200007408625, "p50": 0.08963200002654048, "p90": 0.09017300021696428, "mean": 0.08983220009213255, "iqr": 0.0005410001904238015, "raw_times": [0.08963200002654048, 0.09017300021696428, 0.08926200007408625, 0.08963200002654048, 0.09046200011653127], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09046300010595587, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08713199986232212, "p50": 0.08950200003710052, "p90": 0.08994299992082233, "mean": 0.08932219993766921, "iqr": 0.0012210000477352878, "raw_times": [0.08872199987308704, 0.08950200003710052, 0.08994299992082233, 0.09131199999501405, 0.08713199986232212], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09138199993685703, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08889199989425833, "p50": 0.09063199991032889, "p90": 0.09075200000552286, "mean": 0.09513419990980765, "iqr": 0.0011500001164677087, "raw_times": [0.08889199989425833, 0.08960199988905515, 0.09075200000552286, 0.09063199991032889, 0.11579299984987301], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194200015372189, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985299996311369, "p50": 0.09080199993150018, "p90": 0.09128199985752872, "mean": 0.09099019994209812, "iqr": 0.0010899998414970469, "raw_times": [0.08985299996311369, 0.09019200001603167, 0.09128199985752872, 0.09282199994231632, 0.09080199993150018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09297199994762195, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2603559998988203, "p50": 0.2619460001369589, "p90": 0.2620170000682265, "mean": 0.26208240001324157, "iqr": 0.00017100001059588976, "raw_times": [0.2619460001369589, 0.2603559998988203, 0.2618460000576306, 0.26424699990457157, 0.2620170000682265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26098600005752814, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0871929998993437, "p50": 0.08902200011107197, "p90": 0.08904199989956396, "mean": 0.08843839996188763, "iqr": 0.0015389998679893324, "raw_times": [0.08750300003157463, 0.08904199989956396, 0.08943199986788386, 0.08902200011107197, 0.0871929998993437], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09225299982063007, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0884020000739838, "p50": 0.08922200004235492, "p90": 0.08970199996838346, "mean": 0.08935000005294569, "iqr": 0.0005199999577598646, "raw_times": [0.08922200004235492, 0.0884020000739838, 0.09024200016938266, 0.08970199996838346, 0.0891820000106236], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09462200000598386, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08808200004750688, "p50": 0.08919200013224327, "p90": 0.09035200014295697, "mean": 0.0894580000476708, "iqr": 0.0017100001059588976, "raw_times": [0.08808200004750688, 0.08919200013224327, 0.09035200014295697, 0.09102199987864878, 0.08864200003699807], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0948819999848638, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08770199997343298, "p50": 0.08942199997363787, "p90": 0.08942299996306247, "mean": 0.08932019995882001, "iqr": 0.0003010000000358559, "raw_times": [0.08912199996302661, 0.09093199992094014, 0.08770199997343298, 0.08942299996306247, 0.08942199997363787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09424199993190996, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08902200011107197, "p50": 0.09058199998435157, "p90": 0.09125299993684166, "mean": 0.09215640002366854, "iqr": 0.0012609998520929366, "raw_times": [0.09993300000132876, 0.09058199998435157, 0.09125299993684166, 0.08902200011107197, 0.08999200008474872], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09323300014330016, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08776200002102996, "p50": 0.08892300002116826, "p90": 0.08966199993665214, "mean": 0.0888985999608849, "iqr": 0.001638999947317643, "raw_times": [0.09012299983623961, 0.08892300002116826, 0.08966199993665214, 0.08776200002102996, 0.0880229999893345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09301299996877788, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0880819998201332, "p50": 0.08909200005291495, "p90": 0.08928200008995191, "mean": 0.08905000004233443, "iqr": 0.00023999996301427018, "raw_times": [0.0880819998201332, 0.08904200012693764, 0.08975200012173445, 0.08909200005291495, 0.08928200008995191], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09304200011683861, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08700200010025583, "p50": 0.08849200003169244, "p90": 0.0890119999894523, "mean": 0.08845600000313425, "iqr": 0.0007099999947968172, "raw_times": [0.0890119999894523, 0.08830199999465549, 0.08947199989961518, 0.08700200010025583, 0.08849200003169244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09235300012733205, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904200012693764, "p50": 0.0900719999208377, "p90": 0.09035200014295697, "mean": 0.09022600006574066, "iqr": 0.0009600000794307562, "raw_times": [0.09035200014295697, 0.08904200012693764, 0.0922720000744448, 0.08939200006352621, 0.0900719999208377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09144199998445401, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08612200008428772, "p50": 0.08916199999475793, "p90": 0.08966199993665214, "mean": 0.08842420002110885, "iqr": 0.002328999926248798, "raw_times": [0.08612200008428772, 0.0898420000794431, 0.08916199999475793, 0.08733300001040334, 0.08966199993665214], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09376299999530602, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2547460001096624, "p50": 0.25804599999901257, "p90": 0.2586460000202351, "mean": 0.25757600001270475, "iqr": 0.0013200001376389991, "raw_times": [0.2547460001096624, 0.2591160000520176, 0.25804599999901257, 0.2586460000202351, 0.2573259998825961], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25434600001972285, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8454199999050616, "p50": 0.8495600000060222, "p90": 0.8538209999642277, "mean": 0.8503745999860257, "iqr": 0.0067099999796482734, "raw_times": [0.8559610000702378, 0.8538209999642277, 0.8471109999845794, 0.8495600000060222, 0.8454199999050616], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8642910001981363, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0716920001195831, "p50": 0.07250199996633455, "p90": 0.07283199988705746, "mean": 0.07240760000968294, "iqr": 0.0009109999155043624, "raw_times": [0.07250199996633455, 0.07283199988705746, 0.0716920001195831, 0.0719209999715531, 0.07309100010388647], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08089199991445639, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08928200008995191, "p50": 0.08997100007945846, "p90": 0.0909519999368058, "mean": 0.09690580000096816, "iqr": 0.001269999984288006, "raw_times": [0.08928200008995191, 0.08997100007945846, 0.0896819999525178, 0.0909519999368058, 0.12464199994610681], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10248300009152445, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08953199994721217, "p90": 0.09005199990497204, "mean": 0.08934599995882309, "iqr": 0.0015899997833912494, "raw_times": [0.08800199998404423, 0.0906819998363062, 0.08953199994721217, 0.08846200012158079, 0.09005199990497204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08988199988380075, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08559200000490819, "p50": 0.08698200008439017, "p90": 0.08820199991532718, "mean": 0.08761399999457353, "iqr": 0.0018399998680251883, "raw_times": [0.08820199991532718, 0.08698200008439017, 0.09093199992094014, 0.08559200000490819, 0.08636200004730199], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09196199994221388, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0869620000685245, "p50": 0.08914199997889227, "p90": 0.08982199983620376, "mean": 0.08910199994716095, "iqr": 0.0019899998733308166, "raw_times": [0.08783199996287294, 0.08914199997889227, 0.08982199983620376, 0.0869620000685245, 0.09175199988931126], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08990199989966641, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08547199990971421, "p50": 0.08752200005801569, "p90": 0.08800199998404423, "mean": 0.08713399993212079, "iqr": 0.002420000100755715, "raw_times": [0.08558199988328852, 0.08909199982554128, 0.08752200005801569, 0.08547199990971421, 0.08800199998404423], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911110000743065, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08664199981467391, "p50": 0.0881319999734842, "p90": 0.08973199987849512, "mean": 0.08822799991321517, "iqr": 0.0029299999368959107, "raw_times": [0.08664199981467391, 0.08983199995782343, 0.0881319999734842, 0.0868019999415992, 0.08973199987849512], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08937200004766055, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08743200010030705, "p50": 0.08830199999465549, "p90": 0.08903200000531797, "mean": 0.08846400005495525, "iqr": 0.0009099999260797631, "raw_times": [0.08830199999465549, 0.08903200000531797, 0.08943200009525754, 0.0881220000792382, 0.08743200010030705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105200001613412, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08645200000501063, "p50": 0.08729199998924742, "p90": 0.08832200001052115, "mean": 0.08765380002841994, "iqr": 0.0011610000001383014, "raw_times": [0.08645200000501063, 0.08904200012693764, 0.08832200001052115, 0.08716100001038285, 0.08729199998924742], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923200016397459, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08737200005271006, "p50": 0.08886199998414668, "p90": 0.08923200016397459, "mean": 0.08870620004017837, "iqr": 0.0006990001111262245, "raw_times": [0.08737200005271006, 0.08953199994721217, 0.08886199998414668, 0.08853300005284837, 0.08923200016397459], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981199994195777, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08698100009496557, "p50": 0.0875120001637697, "p90": 0.08985099998426449, "mean": 0.08861960000103863, "iqr": 0.0027790001695393585, "raw_times": [0.0875120001637697, 0.08985099998426449, 0.08698100009496557, 0.09168199994746828, 0.08707199981472513], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09252200015907874, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2610050000839692, "p50": 0.261626000110482, "p90": 0.26182599981439125, "mean": 0.261735599997337, "iqr": 0.0002109998149535386, "raw_times": [0.2616149999994377, 0.2610050000839692, 0.261626000110482, 0.26182599981439125, 0.2626059999784047], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26071599995702854, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08662100003675732, "p50": 0.08793199981482758, "p90": 0.08872200010046072, "mean": 0.08787560000200756, "iqr": 0.0014409999948838959, "raw_times": [0.08662100003675732, 0.08872200010046072, 0.08882199995241535, 0.08793199981482758, 0.08728100010557682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0905719998627319, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08488200001011137, "p50": 0.08702200011612149, "p90": 0.08860200000526675, "mean": 0.08870799997566792, "iqr": 0.002460000132487039, "raw_times": [0.08488200001011137, 0.08702200011612149, 0.08860200000526675, 0.0861419998727797, 0.09689199987406028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09050199992088892, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08632200001557067, "p50": 0.08908199993129529, "p90": 0.08911199984140694, "mean": 0.08852599994497723, "iqr": 0.0001399998836859595, "raw_times": [0.08632200001557067, 0.08914199997889227, 0.08911199984140694, 0.08908199993129529, 0.08897199995772098], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09170199996333395, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08825200006867817, "p50": 0.08947200012698886, "p90": 0.08949199991548085, "mean": 0.08924200005822058, "iqr": 0.00026999987312592566, "raw_times": [0.08825200006867817, 0.08977200013760012, 0.08949199991548085, 0.08947200012698886, 0.08922200004235492], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09157099998446938, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08679200004735321, "p50": 0.08709200005796447, "p90": 0.08838100006869354, "mean": 0.08772180003688845, "iqr": 0.0012890000107290689, "raw_times": [0.08709200005796447, 0.08679200004735321, 0.08925199995246658, 0.08709200005796447, 0.08838100006869354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09222199992109381, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08846200012158079, "p50": 0.08976200001598045, "p90": 0.09003199988910637, "mean": 0.0897020000138582, "iqr": 0.00085999977272877, "raw_times": [0.08846200012158079, 0.09003199988910637, 0.08976200001598045, 0.0891720001163776, 0.09108199992624577], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0899420001587714, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08725199995751609, "p50": 0.08807200015326089, "p90": 0.08955199996307783, "mean": 0.08866800003488606, "iqr": 0.0017499999103165464, "raw_times": [0.08780200005276129, 0.08807200015326089, 0.08955199996307783, 0.08725199995751609, 0.09066200004781422], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10307200000170269, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0861920000261307, "p50": 0.08893199992598966, "p90": 0.08899199997358664, "mean": 0.08807199997136195, "iqr": 0.0017600000319362152, "raw_times": [0.0861920000261307, 0.08723199994165043, 0.08893199992598966, 0.0890119999894523, 0.08899199997358664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09088199999496283, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08883200007403502, "p90": 0.08927299995775684, "mean": 0.08933019998949021, "iqr": 0.0010710000424296595, "raw_times": [0.08800199998404423, 0.08883200007403502, 0.09234200001628778, 0.08820199991532718, 0.08927299995775684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09017200000016601, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575199990445981, "p50": 0.08742199997868738, "p90": 0.08754200007388135, "mean": 0.08730620002097567, "iqr": 0.00023999996301427018, "raw_times": [0.08754200007388135, 0.08730200011086708, 0.08575199990445981, 0.0885130000369827, 0.08742199997868738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0908419999632315, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.25617599999350205, "p50": 0.2583450000201992, "p90": 0.2584750000096392, "mean": 0.2583615999810718, "iqr": 0.0005590000000665896, "raw_times": [0.2579160000095726, 0.2583450000201992, 0.25617599999350205, 0.2608959998724458, 0.2584750000096392], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2549850000832521, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8457680000901746, "p50": 0.8511980001912889, "p90": 0.8513080001648632, "mean": 0.8505584000886302, "iqr": 0.003619000153776142, "raw_times": [0.8513080001648632, 0.8457680000901746, 0.847689000011087, 0.8511980001912889, 0.856828999985737], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8516880000115634, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null} diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html index adcc9d626cb47c98a66cb3d389a2f1ee669bd78f..a68e01a35c6f4c875c640a62c499740072333d73 100644 --- a/rotary/impls/hf_kernels_rotary.html +++ b/rotary/impls/hf_kernels_rotary.html @@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:54:55 2025       
+
Fri Dec 19 23:00:45 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   36C    P0             90W /  350W |       0MiB /  46068MiB |     17%      Default |
+| N/A   40C    P0             85W /  350W |       0MiB /  46068MiB |     24%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 6.21s
+Cell: benchmark | 4.78s
  | 
 
 Raw
@@ -4009,23 +4009,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     397.918us      1712.80%     397.918us     397.918us             1  
-                                      hf_kernels_rotary        10.09%     234.566us        99.43%       2.310ms       2.310ms       0.000us         0.00%      24.512us      24.512us             1  
-                          _rotary_dba7d1e::apply_rotary         2.29%      53.121us         4.37%     101.432us      16.905us      16.192us        69.70%      16.192us       2.699us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.192us        69.70%      16.192us       2.699us             6  
-                                            aten::clone         1.67%      38.889us        82.80%       1.924ms     320.673us       0.000us         0.00%       8.320us       1.387us             6  
-                                            aten::copy_         1.71%      39.649us        78.85%       1.832ms     305.366us       7.040us        30.30%       8.320us       1.387us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.040us        30.30%       7.040us       1.173us             6  
-                                Activity Buffer Request        74.05%       1.721ms        74.05%       1.721ms       1.721ms       1.280us         5.51%       1.280us       1.280us             1  
-                                    aten::empty_strided         2.28%      52.952us         2.28%      52.952us       8.825us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.09%      71.834us         3.09%      71.834us      11.972us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.69%      39.231us         2.17%      50.432us       4.203us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.48%      11.201us         0.48%      11.201us       0.933us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.08%      48.311us         2.08%      48.311us       8.052us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.57%      13.240us         0.57%      13.240us      13.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     384.577us      1653.10%     384.577us     384.577us             1  
+                                      hf_kernels_rotary         9.70%     223.937us        99.35%       2.294ms       2.294ms       0.000us         0.00%      24.544us      24.544us             1  
+                          _rotary_dba7d1e::apply_rotary         2.33%      53.870us         4.38%     101.111us      16.852us      16.128us        69.33%      16.128us       2.688us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.128us        69.33%      16.128us       2.688us             6  
+                                            aten::clone         1.57%      36.153us        83.10%       1.919ms     319.785us       0.000us         0.00%       8.416us       1.403us             6  
+                                            aten::copy_         1.64%      37.980us        79.42%       1.834ms     305.621us       7.136us        30.67%       8.416us       1.403us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.136us        30.67%       7.136us       1.189us             6  
+                                Activity Buffer Request        74.69%       1.725ms        74.69%       1.725ms       1.725ms       1.280us         5.50%       1.280us       1.280us             1  
+                                    aten::empty_strided         2.11%      48.830us         2.11%      48.830us       8.138us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.08%      71.171us         3.08%      71.171us      11.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.71%      39.468us         2.17%      50.120us       4.177us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%      10.652us         0.46%      10.652us       0.888us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.05%      47.241us         2.05%      47.241us       7.873us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.65%      15.020us         0.65%      15.020us      15.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.324ms
-Self CUDA time total: 23.232us
+Self CPU time total: 2.309ms
+Self CUDA time total: 23.264us
 
 
 
@@ -4035,23 +4035,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.006us      1460.58%     351.006us     351.006us             1  
-                                      hf_kernels_rotary         8.50%     188.914us        99.67%       2.215ms       2.215ms       0.000us         0.00%      25.344us      25.344us             1  
-                          _rotary_dba7d1e::apply_rotary         1.92%      42.653us         3.86%      85.804us      14.301us      16.160us        67.24%      16.160us       2.693us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.160us        67.24%      16.160us       2.693us             6  
-                                            aten::clone         1.34%      29.789us        85.47%       1.899ms     316.566us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         1.59%      35.393us        82.63%       1.836ms     306.029us       7.872us        32.76%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        32.76%       7.872us       1.312us             6  
-                                Activity Buffer Request        78.48%       1.744ms        78.48%       1.744ms       1.744ms       1.312us         5.46%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.50%      33.432us         1.50%      33.432us       5.572us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.55%      56.710us         2.55%      56.710us       9.452us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.45%      32.280us         1.84%      40.820us       3.402us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.38%       8.540us         0.38%       8.540us       0.712us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.94%      43.151us         1.94%      43.151us       7.192us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.33%       7.360us         0.33%       7.360us       7.360us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.742us      1343.66%     323.742us     323.742us             1  
+                                      hf_kernels_rotary         8.02%     171.501us        99.72%       2.131ms       2.131ms       0.000us         0.00%      25.406us      25.406us             1  
+                          _rotary_dba7d1e::apply_rotary         1.84%      39.272us         3.69%      78.893us      13.149us      16.224us        67.34%      16.224us       2.704us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.224us        67.34%      16.224us       2.704us             6  
+                                            aten::clone         1.20%      25.714us        86.18%       1.842ms     306.982us       0.000us         0.00%       9.182us       1.530us             6  
+                                            aten::copy_         1.70%      36.249us        83.58%       1.786ms     297.695us       7.870us        32.66%       9.182us       1.530us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.870us        32.66%       7.870us       1.312us             6  
+                                Activity Buffer Request        79.35%       1.696ms        79.35%       1.696ms       1.696ms       1.312us         5.45%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.40%      30.010us         1.40%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.53%      54.011us         2.53%      54.011us       9.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.43%      30.532us         1.82%      38.871us       3.239us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.39%       8.339us         0.39%       8.339us       0.695us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.85%      39.621us         1.85%      39.621us       6.603us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       6.040us         0.28%       6.040us       6.040us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.222ms
-Self CUDA time total: 24.032us
+Self CPU time total: 2.137ms
+Self CUDA time total: 24.094us
 
 
 
@@ -4061,23 +4061,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.861us      1359.32%     328.861us     328.861us             1  
-                                      hf_kernels_rotary         7.82%     169.265us        99.74%       2.160ms       2.160ms       0.000us         0.00%      25.505us      25.505us             1  
-                          _rotary_dba7d1e::apply_rotary         1.90%      41.240us         3.83%      83.032us      13.839us      16.449us        67.99%      16.449us       2.742us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.449us        67.99%      16.449us       2.742us             6  
-                                            aten::clone         1.22%      26.522us        86.28%       1.868ms     311.384us       0.000us         0.00%       9.056us       1.509us             6  
-                                            aten::copy_         1.60%      34.652us        83.63%       1.811ms     301.836us       7.744us        32.01%       9.056us       1.509us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        32.01%       7.744us       1.291us             6  
-                                Activity Buffer Request        79.55%       1.723ms        79.55%       1.723ms       1.723ms       1.312us         5.42%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.42%      30.770us         1.42%      30.770us       5.128us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.49%      53.840us         2.49%      53.840us       8.973us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.40%      30.337us         1.81%      39.289us       3.274us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.41%       8.952us         0.41%       8.952us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.93%      41.792us         1.93%      41.792us       6.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       5.540us         0.26%       5.540us       5.540us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     324.959us      1341.36%     324.959us     324.959us             1  
+                                      hf_kernels_rotary         7.79%     167.945us        99.74%       2.150ms       2.150ms       0.000us         0.00%      25.506us      25.506us             1  
+                          _rotary_dba7d1e::apply_rotary         1.90%      41.040us         3.78%      81.541us      13.590us      16.482us        68.03%      16.482us       2.747us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.482us        68.03%      16.482us       2.747us             6  
+                                            aten::clone         1.17%      25.161us        86.39%       1.862ms     310.280us       0.000us         0.00%       9.024us       1.504us             6  
+                                            aten::copy_         1.63%      35.122us        83.86%       1.807ms     301.213us       7.744us        31.97%       9.024us       1.504us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        31.97%       7.744us       1.291us             6  
+                                Activity Buffer Request        79.59%       1.715ms        79.59%       1.715ms       1.715ms       1.280us         5.28%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.36%      29.241us         1.36%      29.241us       4.874us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.64%      56.970us         2.64%      56.970us       9.495us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.41%      30.477us         1.78%      38.419us       3.202us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.37%       7.942us         0.37%       7.942us       0.662us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.88%      40.501us         1.88%      40.501us       6.750us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.501us         0.26%       5.501us       5.501us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.165ms
-Self CUDA time total: 24.193us
+Self CPU time total: 2.155ms
+Self CUDA time total: 24.226us
 
 
 
@@ -4087,23 +4087,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     332.000us      1184.36%     332.000us     332.000us             1  
-                                      hf_kernels_rotary         7.19%     171.403us        99.79%       2.378ms       2.378ms       0.000us         0.00%      29.792us      29.792us             1  
-                          _rotary_dba7d1e::apply_rotary         1.72%      40.922us         3.47%      82.793us      13.799us      17.632us        62.90%      17.632us       2.939us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.632us        62.90%      17.632us       2.939us             6  
-                                            aten::clone         1.16%      27.640us        87.43%       2.084ms     347.303us       0.000us         0.00%      12.160us       2.027us             6  
-                                            aten::copy_         1.42%      33.951us        84.95%       2.025ms     337.488us      10.400us        37.10%      12.160us       2.027us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        37.10%      10.400us       1.733us             6  
-                                Activity Buffer Request        73.90%       1.761ms        73.90%       1.761ms       1.761ms       1.760us         6.28%       1.760us       1.760us             1  
-                                    aten::empty_strided         1.31%      31.250us         1.31%      31.250us       5.208us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.63%     229.586us         9.63%     229.586us      38.264us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.31%      31.193us         1.70%      40.482us       3.373us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.39%       9.289us         0.39%       9.289us       0.774us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.76%      41.871us         1.76%      41.871us       6.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.21%       5.050us         0.21%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     364.384us      1292.46%     364.384us     364.384us             1  
+                                      hf_kernels_rotary         8.38%     203.415us        99.78%       2.422ms       2.422ms       0.000us         0.00%      29.986us      29.986us             1  
+                          _rotary_dba7d1e::apply_rotary         1.78%      43.242us         3.59%      87.183us      14.530us      17.664us        62.65%      17.664us       2.944us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        62.65%      17.664us       2.944us             6  
+                                            aten::clone         1.06%      25.710us        86.14%       2.091ms     348.456us       0.000us         0.00%      12.322us       2.054us             6  
+                                            aten::copy_         1.40%      33.961us        83.85%       2.035ms     339.187us      10.529us        37.35%      12.322us       2.054us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.529us        37.35%      10.529us       1.755us             6  
+                                Activity Buffer Request        73.02%       1.772ms        73.02%       1.772ms       1.772ms       1.793us         6.36%       1.793us       1.793us             1  
+                                    aten::empty_strided         1.23%      29.901us         1.23%      29.901us       4.983us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.43%     228.855us         9.43%     228.855us      38.143us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.32%      32.031us         1.67%      40.620us       3.385us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.35%       8.589us         0.35%       8.589us       0.716us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.81%      43.941us         1.81%      43.941us       7.323us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       5.260us         0.22%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.384ms
-Self CUDA time total: 28.032us
+Self CPU time total: 2.427ms
+Self CUDA time total: 28.193us
 
 
 
@@ -4113,23 +4113,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.418us      1384.59%     335.418us     335.418us             1  
-                                      hf_kernels_rotary        20.26%     170.294us        99.34%     834.940us     834.940us       0.000us         0.00%      25.537us      25.537us             1  
-                          _rotary_dba7d1e::apply_rotary         4.83%      40.562us         9.92%      83.412us      13.902us      16.513us        68.17%      16.513us       2.752us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.513us        68.17%      16.513us       2.752us             6  
-                                            aten::clone         2.64%      22.222us        64.45%     541.674us      90.279us       0.000us         0.00%       9.024us       1.504us             6  
-                                            aten::copy_         4.18%      35.111us        57.94%     486.972us      81.162us       7.712us        31.83%       9.024us       1.504us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        31.83%       7.712us       1.285us             6  
-                                Activity Buffer Request        27.90%     234.506us        27.90%     234.506us     234.506us       1.312us         5.42%       1.312us       1.312us             1  
-                                    aten::empty_strided         3.86%      32.480us         3.86%      32.480us       5.413us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.86%     217.355us        25.86%     217.355us      36.226us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.66%      30.769us         4.71%      39.560us       3.297us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.05%       8.791us         1.05%       8.791us       0.733us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.10%      42.850us         5.10%      42.850us       7.142us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.66%       5.560us         0.66%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.517us      1337.29%     323.517us     323.517us             1  
+                                      hf_kernels_rotary        20.05%     169.673us        99.33%     840.688us     840.688us       0.000us         0.00%      25.472us      25.472us             1  
+                          _rotary_dba7d1e::apply_rotary         4.98%      42.181us         9.92%      83.942us      13.990us      16.480us        68.12%      16.480us       2.747us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.480us        68.12%      16.480us       2.747us             6  
+                                            aten::clone         2.34%      19.800us        64.76%     548.062us      91.344us       0.000us         0.00%       8.992us       1.499us             6  
+                                            aten::copy_         3.95%      33.463us        59.03%     499.612us      83.269us       7.712us        31.88%       8.992us       1.499us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        31.88%       7.712us       1.285us             6  
+                                Activity Buffer Request        28.63%     242.305us        28.63%     242.305us     242.305us       1.280us         5.29%       1.280us       1.280us             1  
+                                    aten::empty_strided         3.39%      28.650us         3.39%      28.650us       4.775us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.45%     223.844us        26.45%     223.844us      37.307us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.63%      30.762us         4.61%      39.011us       3.251us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.97%       8.249us         0.97%       8.249us       0.687us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.93%      41.761us         4.93%      41.761us       6.960us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.67%       5.650us         0.67%       5.650us       5.650us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 840.500us
-Self CUDA time total: 24.225us
+Self CPU time total: 846.338us
+Self CUDA time total: 24.192us
 
 
 
@@ -4139,23 +4139,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.419us      1187.11%     335.419us     335.419us             1  
-                                      hf_kernels_rotary        22.11%     159.484us        99.14%     715.038us     715.038us       0.000us         0.00%      30.047us      30.047us             1  
-                          _rotary_dba7d1e::apply_rotary         6.01%      43.310us        12.22%      88.120us      14.687us      17.727us        62.74%      17.727us       2.954us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.727us        62.74%      17.727us       2.954us             6  
-                                            aten::clone         2.86%      20.618us        59.23%     427.160us      71.193us       0.000us         0.00%      12.320us       2.053us             6  
-                                            aten::copy_         4.70%      33.871us        52.15%     376.129us      62.688us      10.528us        37.26%      12.320us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        37.26%      10.528us       1.755us             6  
-                                Activity Buffer Request        17.96%     129.563us        17.96%     129.563us     129.563us       1.792us         6.34%       1.792us       1.792us             1  
-                                    aten::empty_strided         4.22%      30.413us         4.22%      30.413us       5.069us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        29.49%     212.695us        29.49%     212.695us      35.449us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.36%      31.443us         5.58%      40.274us       3.356us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.22%       8.831us         1.22%       8.831us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         6.21%      44.810us         6.21%      44.810us       7.468us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.86%       6.181us         0.86%       6.181us       6.181us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     311.513us      1102.47%     311.513us     311.513us             1  
+                                      hf_kernels_rotary        20.61%     164.155us        99.30%     791.098us     791.098us       0.000us         0.00%      30.048us      30.048us             1  
+                          _rotary_dba7d1e::apply_rotary         4.94%      39.349us         9.84%      78.361us      13.060us      17.696us        62.63%      17.696us       2.949us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        62.63%      17.696us       2.949us             6  
+                                            aten::clone         2.47%      19.670us        64.03%     510.081us      85.014us       0.000us         0.00%      12.352us       2.059us             6  
+                                            aten::copy_         4.07%      32.461us        57.94%     461.581us      76.930us      10.560us        37.37%      12.352us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        37.37%      10.560us       1.760us             6  
+                                Activity Buffer Request        27.37%     218.044us        27.37%     218.044us     218.044us       1.792us         6.34%       1.792us       1.792us             1  
+                                    aten::empty_strided         3.62%      28.830us         3.62%      28.830us       4.805us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.50%     211.076us        26.50%     211.076us      35.179us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.85%      30.651us         4.83%      38.501us       3.208us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       7.850us         0.99%       7.850us       0.654us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.90%      39.012us         4.90%      39.012us       6.502us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.70%       5.550us         0.70%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 721.219us
-Self CUDA time total: 28.255us
+Self CPU time total: 796.648us
+Self CUDA time total: 28.256us
 
 
 
@@ -4165,23 +4165,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.104us       833.12%     339.104us     339.104us             1  
-                                      hf_kernels_rotary         7.67%     177.694us        99.76%       2.310ms       2.310ms       0.000us         0.00%      43.583us      43.583us             1  
-                          _rotary_dba7d1e::apply_rotary         1.80%      41.651us         3.62%      83.803us      13.967us      23.520us        57.78%      23.520us       3.920us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.520us        57.78%      23.520us       3.920us             6  
-                                            aten::clone         1.20%      27.761us        86.70%       2.008ms     334.588us       0.000us         0.00%      20.063us       3.344us             6  
-                                            aten::copy_         1.49%      34.550us        84.17%       1.949ms     324.808us      17.183us        42.22%      20.063us       3.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.183us        42.22%      17.183us       2.864us             6  
-                                Activity Buffer Request        73.48%       1.701ms        73.48%       1.701ms       1.701ms       2.880us         7.08%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.34%      30.920us         1.34%      30.920us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.20%     212.976us         9.20%     212.976us      35.496us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.39%      32.120us         1.76%      40.721us       3.393us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.37%       8.601us         0.37%       8.601us       0.717us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.82%      42.152us         1.82%      42.152us       7.025us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       5.660us         0.24%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.712us       798.42%     323.712us     323.712us             1  
+                                      hf_kernels_rotary         7.57%     170.665us        99.78%       2.250ms       2.250ms       0.000us         0.00%      43.392us      43.392us             1  
+                          _rotary_dba7d1e::apply_rotary         1.73%      38.911us         3.48%      78.541us      13.090us      23.456us        57.85%      23.456us       3.909us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.456us        57.85%      23.456us       3.909us             6  
+                                            aten::clone         1.16%      26.090us        87.02%       1.962ms     327.007us       0.000us         0.00%      19.936us       3.323us             6  
+                                            aten::copy_         1.56%      35.131us        84.49%       1.905ms     317.482us      17.088us        42.15%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        42.15%      17.088us       2.848us             6  
+                                Activity Buffer Request        74.36%       1.676ms        74.36%       1.676ms       1.676ms       2.848us         7.02%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.38%      31.061us         1.38%      31.061us       5.177us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.57%     193.284us         8.57%     193.284us      32.214us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.34%      30.268us         1.71%      38.460us       3.205us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.192us         0.36%       8.192us       0.683us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.76%      39.630us         1.76%      39.630us       6.605us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       4.960us         0.22%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.315ms
-Self CUDA time total: 40.703us
+Self CPU time total: 2.255ms
+Self CUDA time total: 40.544us
 
 
 
@@ -4191,23 +4191,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.239us       459.79%     342.239us     342.239us             1  
-                                      hf_kernels_rotary         7.38%     176.732us        99.78%       2.390ms       2.390ms       0.000us         0.00%      82.913us      82.913us             1  
-                                            aten::clone         1.17%      28.130us        87.22%       2.089ms     348.177us       0.000us         0.00%      43.777us       7.296us             6  
-                                            aten::copy_         1.46%      34.971us        84.72%       2.029ms     338.203us      35.297us        47.42%      43.777us       7.296us             6  
-                          _rotary_dba7d1e::apply_rotary         1.71%      40.931us         3.50%      83.864us      13.977us      39.136us        52.58%      39.136us       6.523us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.136us        52.58%      39.136us       6.523us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      35.297us        47.42%      35.297us       5.883us             6  
-                                Activity Buffer Request        74.20%       1.777ms        74.20%       1.777ms       1.777ms       8.480us        11.39%       8.480us       8.480us             1  
-                                    aten::empty_strided         1.32%      31.711us         1.32%      31.711us       5.285us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.06%     217.015us         9.06%     217.015us      36.169us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.30%      31.251us         1.69%      40.431us       3.369us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.38%       9.180us         0.38%       9.180us       0.765us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.79%      42.933us         1.79%      42.933us       7.155us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       5.191us         0.22%       5.191us       5.191us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     325.696us       437.95%     325.696us     325.696us             1  
+                                      hf_kernels_rotary         7.25%     170.323us        99.77%       2.344ms       2.344ms       0.000us         0.00%      82.880us      82.880us             1  
+                                            aten::clone         1.13%      26.590us        87.43%       2.054ms     342.324us       0.000us         0.00%      44.000us       7.333us             6  
+                                            aten::copy_         1.50%      35.240us        85.03%       1.997ms     332.900us      35.488us        47.72%      44.000us       7.333us             6  
+                          _rotary_dba7d1e::apply_rotary         1.74%      40.881us         3.46%      81.232us      13.539us      38.880us        52.28%      38.880us       6.480us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.880us        52.28%      38.880us       6.480us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      35.488us        47.72%      35.488us       5.915us             6  
+                                Activity Buffer Request        75.65%       1.777ms        75.65%       1.777ms       1.777ms       8.512us        11.45%       8.512us       8.512us             1  
+                                    aten::empty_strided         1.27%      29.951us         1.27%      29.951us       4.992us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.87%     184.984us         7.87%     184.984us      30.831us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.28%      30.151us         1.62%      38.112us       3.176us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.34%       7.961us         0.34%       7.961us       0.663us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.72%      40.351us         1.72%      40.351us       6.725us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       5.500us         0.23%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.395ms
-Self CUDA time total: 74.433us
+Self CPU time total: 2.349ms
+Self CUDA time total: 74.368us
 
 
 
@@ -4217,23 +4217,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     332.929us       819.86%     332.929us     332.929us             1  
-                                      hf_kernels_rotary         7.07%     166.848us        99.78%       2.353ms       2.353ms       0.000us         0.00%      43.488us      43.488us             1  
-                          _rotary_dba7d1e::apply_rotary         1.68%      39.560us         3.51%      82.681us      13.780us      23.488us        57.84%      23.488us       3.915us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.488us        57.84%      23.488us       3.915us             6  
-                                            aten::clone         1.16%      27.340us        87.49%       2.064ms     343.923us       0.000us         0.00%      20.000us       3.333us             6  
-                                            aten::copy_         1.51%      35.519us        85.00%       2.005ms     334.108us      17.120us        42.16%      20.000us       3.333us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.120us        42.16%      17.120us       2.853us             6  
-                                Activity Buffer Request        74.50%       1.757ms        74.50%       1.757ms       1.757ms       2.880us         7.09%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.34%      31.550us         1.34%      31.550us       5.258us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.99%     212.096us         8.99%     212.096us      35.349us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.35%      31.900us         1.71%      40.350us       3.362us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.36%       8.450us         0.36%       8.450us       0.704us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.83%      43.121us         1.83%      43.121us       7.187us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       5.120us         0.22%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     322.973us       797.86%     322.973us     322.973us             1  
+                                      hf_kernels_rotary         8.42%     188.754us        99.76%       2.236ms       2.236ms       0.000us         0.00%      43.328us      43.328us             1  
+                          _rotary_dba7d1e::apply_rotary         1.78%      39.922us         3.52%      78.932us      13.155us      23.456us        57.94%      23.456us       3.909us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.456us        57.94%      23.456us       3.909us             6  
+                                            aten::clone         1.19%      26.589us        86.07%       1.929ms     321.553us       0.000us         0.00%      19.872us       3.312us             6  
+                                            aten::copy_         1.47%      33.001us        83.54%       1.873ms     312.092us      17.024us        42.06%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        42.06%      17.024us       2.837us             6  
+                                Activity Buffer Request        73.86%       1.656ms        73.86%       1.656ms       1.656ms       2.848us         7.04%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.35%      30.180us         1.35%      30.180us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.21%     183.975us         8.21%     183.975us      30.662us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.39%      31.060us         1.75%      39.122us       3.260us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.062us         0.36%       8.062us       0.672us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.74%      39.010us         1.74%      39.010us       6.502us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.491us         0.24%       5.491us       5.491us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.359ms
-Self CUDA time total: 40.608us
+Self CPU time total: 2.242ms
+Self CUDA time total: 40.480us
 
 
 
@@ -4243,23 +4243,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.142us       439.39%     338.142us     338.142us             1  
-                                      hf_kernels_rotary         7.05%     174.064us        99.80%       2.465ms       2.465ms       0.000us         0.00%      86.270us      86.270us             1  
-                                            aten::clone         1.20%      29.650us        87.84%       2.170ms     361.584us       0.000us         0.00%      47.071us       7.845us             6  
-                                            aten::copy_         1.42%      34.959us        85.36%       2.108ms     351.395us      37.759us        49.06%      47.071us       7.845us             6  
-                          _rotary_dba7d1e::apply_rotary         1.66%      41.022us         3.32%      82.043us      13.674us      39.199us        50.94%      39.199us       6.533us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.199us        50.94%      39.199us       6.533us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.759us        49.06%      37.759us       6.293us             6  
-                                Activity Buffer Request        75.49%       1.864ms        75.49%       1.864ms       1.864ms       9.312us        12.10%       9.312us       9.312us             1  
-                                    aten::empty_strided         1.27%      31.482us         1.27%      31.482us       5.247us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.46%     208.956us         8.46%     208.956us      34.826us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.25%      30.771us         1.59%      39.290us       3.274us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.34%       8.519us         0.34%       8.519us       0.710us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.66%      41.021us         1.66%      41.021us       6.837us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.20%       5.010us         0.20%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     323.101us       423.90%     323.101us     323.101us             1  
+                                      hf_kernels_rotary         7.30%     168.335us        99.78%       2.301ms       2.301ms       0.000us         0.00%      85.500us      85.500us             1  
+                                            aten::clone         1.11%      25.632us        87.37%       2.015ms     335.877us       0.000us         0.00%      46.364us       7.727us             6  
+                                            aten::copy_         1.44%      33.260us        84.98%       1.960ms     326.665us      37.085us        48.65%      46.364us       7.727us             6  
+                          _rotary_dba7d1e::apply_rotary         1.70%      39.159us         3.44%      79.421us      13.237us      39.136us        51.35%      39.136us       6.523us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.136us        51.35%      39.136us       6.523us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.085us        48.65%      37.085us       6.181us             6  
+                                Activity Buffer Request        75.65%       1.745ms        75.65%       1.745ms       1.745ms       9.279us        12.17%       9.279us       9.279us             1  
+                                    aten::empty_strided         1.29%      29.640us         1.29%      29.640us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.89%     181.984us         7.89%     181.984us      30.331us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.33%      30.620us         1.67%      38.470us       3.206us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.34%       7.850us         0.34%       7.850us       0.654us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.75%      40.262us         1.75%      40.262us       6.710us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       5.000us         0.22%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.470ms
-Self CUDA time total: 76.958us
+Self CPU time total: 2.306ms
+Self CUDA time total: 76.221us
 
 
 
@@ -4269,23 +4269,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     343.679us       247.41%     343.679us     343.679us             1  
-                                      hf_kernels_rotary         7.47%     171.684us        99.76%       2.294ms       2.294ms       0.000us         0.00%     162.559us     162.559us             1  
-                                            aten::clone         1.25%      28.681us        86.89%       1.998ms     333.015us       0.000us         0.00%     102.592us      17.099us             6  
-                                            aten::copy_         1.49%      34.332us        84.20%       1.936ms     322.703us      78.944us        56.83%     102.592us      17.099us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.944us        56.83%      78.944us      13.157us             6  
-                          _rotary_dba7d1e::apply_rotary         1.83%      42.151us         3.66%      84.183us      14.030us      59.967us        43.17%      59.967us       9.995us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      59.967us        43.17%      59.967us       9.995us             6  
-                                Activity Buffer Request        73.81%       1.697ms        73.81%       1.697ms       1.697ms      23.648us        17.02%      23.648us      23.648us             1  
-                                    aten::empty_strided         1.44%      33.190us         1.44%      33.190us       5.532us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.89%     204.504us         8.89%     204.504us      34.084us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.37%      31.511us         1.74%      40.120us       3.343us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.37%       8.609us         0.37%       8.609us       0.717us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.83%      42.032us         1.83%      42.032us       7.005us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       5.461us         0.24%       5.461us       5.461us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.643us       235.88%     328.643us     328.643us             1  
+                                      hf_kernels_rotary         7.36%     166.641us        99.76%       2.258ms       2.258ms       0.000us         0.00%     163.042us     163.042us             1  
+                                            aten::clone         1.33%      30.021us        87.18%       1.973ms     328.842us       0.000us         0.00%     102.882us      17.147us             6  
+                                            aten::copy_         1.51%      34.171us        84.51%       1.913ms     318.757us      79.169us        56.82%     102.882us      17.147us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      79.169us        56.82%      79.169us      13.195us             6  
+                          _rotary_dba7d1e::apply_rotary         1.75%      39.570us         3.52%      79.562us      13.260us      60.160us        43.18%      60.160us      10.027us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      60.160us        43.18%      60.160us      10.027us             6  
+                                Activity Buffer Request        74.82%       1.693ms        74.82%       1.693ms       1.693ms      23.713us        17.02%      23.713us      23.713us             1  
+                                    aten::empty_strided         1.35%      30.491us         1.35%      30.491us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.17%     185.005us         8.17%     185.005us      30.834us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.35%      30.551us         1.70%      38.372us       3.198us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.35%       7.821us         0.35%       7.821us       0.652us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.77%      39.992us         1.77%      39.992us       6.665us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.501us         0.24%       5.501us       5.501us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.300ms
-Self CUDA time total: 138.911us
+Self CPU time total: 2.263ms
+Self CUDA time total: 139.329us
 
 
 
@@ -4295,23 +4295,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         6.47%     175.111us        88.03%       2.384ms       2.384ms       0.000us         0.00%     770.847us     770.847us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     713.695us       101.13%     713.695us     713.695us             1  
-                                            aten::clone         1.03%      27.812us        76.48%       2.071ms     345.215us       0.000us         0.00%     568.671us      94.778us             6  
-                                            aten::copy_         1.35%      36.632us        74.30%       2.012ms     335.367us     503.551us        71.35%     568.671us      94.778us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     503.551us        71.35%     503.551us      83.925us             6  
-                          _rotary_dba7d1e::apply_rotary         1.88%      50.972us         3.57%      96.775us      16.129us     202.176us        28.65%     202.176us      33.696us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     202.176us        28.65%     202.176us      33.696us             6  
-                                Activity Buffer Request        65.43%       1.772ms        65.43%       1.772ms       1.772ms      65.120us         9.23%      65.120us      65.120us             1  
-                                    aten::empty_strided         1.15%      31.280us         1.15%      31.280us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.52%     203.605us         7.52%     203.605us      33.934us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.18%      32.050us         1.51%      41.000us       3.417us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.33%       8.950us         0.33%       8.950us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.69%      45.803us         1.69%      45.803us       7.634us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        11.97%     324.077us        11.97%     324.077us     324.077us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         6.40%     173.445us        87.62%       2.374ms       2.374ms       0.000us         0.00%     766.307us     766.307us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     707.554us       101.09%     707.554us     707.554us             1  
+                                            aten::clone         0.97%      26.320us        76.46%       2.071ms     345.229us       0.000us         0.00%     566.210us      94.368us             6  
+                                            aten::copy_         1.24%      33.572us        74.40%       2.016ms     335.956us     499.842us        71.41%     566.210us      94.368us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     499.842us        71.41%     499.842us      83.307us             6  
+                          _rotary_dba7d1e::apply_rotary         1.84%      49.730us         3.34%      90.461us      15.077us     200.097us        28.59%     200.097us      33.350us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     200.097us        28.59%     200.097us      33.350us             6  
+                                Activity Buffer Request        66.43%       1.800ms        66.43%       1.800ms       1.800ms      66.368us         9.48%      66.368us      66.368us             1  
+                                    aten::empty_strided         1.08%      29.321us         1.08%      29.321us       4.887us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.73%     182.313us         6.73%     182.313us      30.385us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.12%      30.440us         1.42%      38.441us       3.203us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.30%       8.001us         0.30%       8.001us       0.667us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.50%      40.731us         1.50%      40.731us       6.788us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        12.38%     335.537us        12.38%     335.537us     335.537us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.708ms
-Self CUDA time total: 705.727us
+Self CPU time total: 2.709ms
+Self CUDA time total: 699.939us
 
 
 
@@ -4321,23 +4321,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     336.477us      1265.33%     336.477us     336.477us             1  
-                                      hf_kernels_rotary         7.39%     171.995us        99.77%       2.322ms       2.322ms       0.000us         0.00%      27.904us      27.904us             1  
-                          _rotary_dba7d1e::apply_rotary         1.76%      41.061us         3.57%      83.061us      13.844us      18.816us        70.76%      18.816us       3.136us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.816us        70.76%      18.816us       3.136us             6  
-                                            aten::clone         1.24%      28.803us        87.11%       2.027ms     337.843us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         1.64%      38.082us        84.50%       1.966ms     327.735us       7.776us        29.24%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        29.24%       7.776us       1.296us             6  
-                                Activity Buffer Request        72.85%       1.695ms        72.85%       1.695ms       1.695ms       1.312us         4.93%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.37%      31.849us         1.37%      31.849us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.01%     232.925us        10.01%     232.925us      38.821us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.33%      30.872us         1.71%      39.700us       3.308us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.38%       8.828us         0.38%       8.828us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.80%      42.000us         1.80%      42.000us       7.000us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       5.320us         0.23%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     326.335us      1221.36%     326.335us     326.335us             1  
+                                      hf_kernels_rotary         7.45%     166.364us        99.75%       2.229ms       2.229ms       0.000us         0.00%      28.063us      28.063us             1  
+                          _rotary_dba7d1e::apply_rotary         1.91%      42.732us         3.76%      83.973us      13.996us      18.815us        70.42%      18.815us       3.136us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.815us        70.42%      18.815us       3.136us             6  
+                                            aten::clone         1.19%      26.589us        86.78%       1.939ms     323.190us       0.000us         0.00%       9.248us       1.541us             6  
+                                            aten::copy_         1.45%      32.469us        84.19%       1.881ms     313.518us       7.904us        29.58%       9.248us       1.541us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        29.58%       7.904us       1.317us             6  
+                                Activity Buffer Request        74.65%       1.668ms        74.65%       1.668ms       1.668ms       1.344us         5.03%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.41%      31.442us         1.41%      31.442us       5.240us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.08%     180.586us         8.08%     180.586us      30.098us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.40%      31.310us         1.76%      39.389us       3.282us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.36%       8.079us         0.36%       8.079us       0.673us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.85%      41.241us         1.85%      41.241us       6.873us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.25%       5.600us         0.25%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.327ms
-Self CUDA time total: 26.592us
+Self CPU time total: 2.234ms
+Self CUDA time total: 26.719us
 
 
 
@@ -4347,23 +4347,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.593us      1356.47%     361.593us     361.593us             1  
-                                      hf_kernels_rotary        20.01%     156.574us        99.29%     776.889us     776.889us       0.000us         0.00%      27.969us      27.969us             1  
-                          _rotary_dba7d1e::apply_rotary         6.08%      47.572us        11.90%      93.114us      15.519us      18.881us        70.83%      18.881us       3.147us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.881us        70.83%      18.881us       3.147us             6  
-                                            aten::clone         2.86%      22.401us        61.73%     483.012us      80.502us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         5.02%      39.248us        54.77%     428.540us      71.423us       7.776us        29.17%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        29.17%       7.776us       1.296us             6  
-                                Activity Buffer Request        23.43%     183.305us        23.43%     183.305us     183.305us       1.312us         4.92%       1.312us       1.312us             1  
-                                    aten::empty_strided         4.10%      32.071us         4.10%      32.071us       5.345us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.33%     205.987us        26.33%     205.987us      34.331us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.44%      34.709us         5.65%      44.189us       3.682us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.21%       9.480us         1.21%       9.480us       0.790us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.82%      45.542us         5.82%      45.542us       7.590us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.71%       5.560us         0.71%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     321.948us      1206.29%     321.948us     321.948us             1  
+                                      hf_kernels_rotary        18.26%     148.741us        99.36%     809.528us     809.528us       0.000us         0.00%      28.001us      28.001us             1  
+                          _rotary_dba7d1e::apply_rotary         5.08%      41.412us        10.07%      82.013us      13.669us      18.880us        70.74%      18.880us       3.147us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.880us        70.74%      18.880us       3.147us             6  
+                                            aten::clone         2.30%      18.723us        66.05%     538.143us      89.691us       0.000us         0.00%       9.121us       1.520us             6  
+                                            aten::copy_         4.35%      35.479us        60.21%     490.570us      81.762us       7.809us        29.26%       9.121us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.809us        29.26%       7.809us       1.301us             6  
+                                Activity Buffer Request        33.00%     268.886us        33.00%     268.886us     268.886us       1.312us         4.92%       1.312us       1.312us             1  
+                                    aten::empty_strided         3.54%      28.850us         3.54%      28.850us       4.808us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.85%     186.205us        22.85%     186.205us      31.034us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.00%      32.551us         4.99%      40.631us       3.386us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       8.080us         0.99%       8.080us       0.673us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.98%      40.601us         4.98%      40.601us       6.767us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.64%       5.220us         0.64%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 782.449us
-Self CUDA time total: 26.657us
+Self CPU time total: 814.748us
+Self CUDA time total: 26.689us
 
 
 
@@ -4373,23 +4373,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     329.530us      1073.84%     329.530us     329.530us             1  
-                                      hf_kernels_rotary        18.69%     150.673us        99.29%     800.610us     800.610us       0.000us         0.00%      32.447us      32.447us             1  
-                          _rotary_dba7d1e::apply_rotary         5.09%      41.061us        10.37%      83.622us      13.937us      20.159us        65.69%      20.159us       3.360us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.159us        65.69%      20.159us       3.360us             6  
-                                            aten::clone         2.45%      19.762us        65.24%     526.013us      87.669us       0.000us         0.00%      12.288us       2.048us             6  
-                                            aten::copy_         4.31%      34.749us        58.96%     475.401us      79.234us      10.528us        34.31%      12.288us       2.048us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us        34.31%      10.528us       1.755us             6  
-                                Activity Buffer Request        29.87%     240.876us        29.87%     240.876us     240.876us       1.760us         5.74%       1.760us       1.760us             1  
-                                    aten::empty_strided         3.83%      30.850us         3.83%      30.850us       5.142us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        24.78%     199.776us        24.78%     199.776us      33.296us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.92%      31.582us         5.00%      40.302us       3.358us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.08%       8.720us         1.08%       8.720us       0.727us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.28%      42.561us         5.28%      42.561us       7.094us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.71%       5.701us         0.71%       5.701us       5.701us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     317.754us      1035.40%     317.754us     317.754us             1  
+                                      hf_kernels_rotary        20.08%     146.926us        99.34%     726.766us     726.766us       0.000us         0.00%      32.481us      32.481us             1  
+                          _rotary_dba7d1e::apply_rotary         5.67%      41.461us        11.01%      80.552us      13.425us      20.192us        65.80%      20.192us       3.365us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.192us        65.80%      20.192us       3.365us             6  
+                                            aten::clone         2.73%      19.937us        62.86%     459.858us      76.643us       0.000us         0.00%      12.289us       2.048us             6  
+                                            aten::copy_         4.61%      33.700us        56.15%     410.789us      68.465us      10.497us        34.20%      12.289us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.497us        34.20%      10.497us       1.750us             6  
+                                Activity Buffer Request        26.92%     196.965us        26.92%     196.965us     196.965us       1.792us         5.84%       1.792us       1.792us             1  
+                                    aten::empty_strided         3.98%      29.132us         3.98%      29.132us       4.855us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.62%     180.124us        24.62%     180.124us      30.021us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.28%      31.330us         5.39%      39.430us       3.286us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.11%       8.100us         1.11%       8.100us       0.675us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.34%      39.091us         5.34%      39.091us       6.515us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       4.850us         0.66%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 806.311us
-Self CUDA time total: 30.687us
+Self CPU time total: 731.616us
+Self CUDA time total: 30.689us
 
 
 
@@ -4399,23 +4399,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     329.435us       772.31%     329.435us     329.435us             1  
-                                      hf_kernels_rotary        20.10%     145.342us        99.21%     717.528us     717.528us       0.000us         0.00%      45.504us      45.504us             1  
-                          _rotary_dba7d1e::apply_rotary         5.72%      41.400us        11.55%      83.552us      13.925us      25.568us        59.94%      25.568us       4.261us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.568us        59.94%      25.568us       4.261us             6  
-                                            aten::clone         2.77%      20.032us        61.99%     448.302us      74.717us       0.000us         0.00%      19.936us       3.323us             6  
-                                            aten::copy_         4.93%      35.690us        55.04%     398.089us      66.348us      17.088us        40.06%      19.936us       3.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        40.06%      17.088us       2.848us             6  
-                                Activity Buffer Request        22.43%     162.214us        22.43%     162.214us     162.214us       2.848us         6.68%       2.848us       2.848us             1  
-                                    aten::empty_strided         4.17%      30.181us         4.17%      30.181us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        27.68%     200.185us        27.68%     200.185us      33.364us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.37%      31.593us         5.58%      40.332us       3.361us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.21%       8.739us         1.21%       8.739us       0.728us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.83%      42.152us         5.83%      42.152us       7.025us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.79%       5.700us         0.79%       5.700us       5.700us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     319.429us       749.90%     319.429us     319.429us             1  
+                                      hf_kernels_rotary        19.19%     146.865us        99.27%     759.737us     759.737us       0.000us         0.00%      45.476us      45.476us             1  
+                          _rotary_dba7d1e::apply_rotary         5.51%      42.192us        10.66%      81.562us      13.594us      25.698us        60.33%      25.698us       4.283us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.698us        60.33%      25.698us       4.283us             6  
+                                            aten::clone         2.51%      19.231us        64.35%     492.451us      82.075us       0.000us         0.00%      19.778us       3.296us             6  
+                                            aten::copy_         4.10%      31.370us        57.94%     443.419us      73.903us      16.898us        39.67%      19.778us       3.296us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.898us        39.67%      16.898us       2.816us             6  
+                                Activity Buffer Request        30.11%     230.405us        30.11%     230.405us     230.405us       2.880us         6.76%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.89%      29.801us         3.89%      29.801us       4.967us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.73%     181.644us        23.73%     181.644us      30.274us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.99%      30.570us         5.08%      38.859us       3.238us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.08%       8.289us         1.08%       8.289us       0.691us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.14%      39.370us         5.14%      39.370us       6.562us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.73%       5.590us         0.73%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 723.228us
-Self CUDA time total: 42.656us
+Self CPU time total: 765.327us
+Self CUDA time total: 42.596us
 
 
 
@@ -4425,23 +4425,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.651us      1075.32%     330.651us     330.651us             1  
-                                      hf_kernels_rotary        18.72%     150.931us        99.33%     800.790us     800.790us       0.000us         0.00%      32.508us      32.508us             1  
-                          _rotary_dba7d1e::apply_rotary         5.21%      41.984us        10.24%      82.545us      13.758us      20.318us        66.08%      20.318us       3.386us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.318us        66.08%      20.318us       3.386us             6  
-                                            aten::clone         2.51%      20.253us        65.40%     527.224us      87.871us       0.000us         0.00%      12.190us       2.032us             6  
-                                            aten::copy_         4.39%      35.371us        59.06%     476.161us      79.360us      10.431us        33.92%      12.190us       2.032us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        33.92%      10.431us       1.738us             6  
-                                Activity Buffer Request        30.20%     243.496us        30.20%     243.496us     243.496us       1.759us         5.72%       1.759us       1.759us             1  
-                                    aten::empty_strided         3.82%      30.810us         3.82%      30.810us       5.135us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        24.47%     197.294us        24.47%     197.294us      32.882us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.87%      31.220us         4.97%      40.090us       3.341us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.10%       8.870us         1.10%       8.870us       0.739us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.03%      40.561us         5.03%      40.561us       6.760us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.67%       5.380us         0.67%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     322.049us      1047.28%     322.049us     322.049us             1  
+                                      hf_kernels_rotary        18.29%     149.504us        99.36%     812.348us     812.348us       0.000us         0.00%      32.511us      32.511us             1  
+                          _rotary_dba7d1e::apply_rotary         5.08%      41.522us        10.02%      81.922us      13.654us      20.224us        65.77%      20.224us       3.371us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.224us        65.77%      20.224us       3.371us             6  
+                                            aten::clone         2.45%      20.060us        66.16%     540.872us      90.145us       0.000us         0.00%      12.287us       2.048us             6  
+                                            aten::copy_         4.16%      33.972us        60.11%     491.422us      81.904us      10.527us        34.23%      12.287us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us        34.23%      10.527us       1.754us             6  
+                                Activity Buffer Request        32.85%     268.566us        32.85%     268.566us     268.566us       1.760us         5.72%       1.760us       1.760us             1  
+                                    aten::empty_strided         3.59%      29.390us         3.59%      29.390us       4.898us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.10%     188.884us        23.10%     188.884us      31.481us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.91%      31.930us         4.90%      40.050us       3.337us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.99%       8.120us         0.99%       8.120us       0.677us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.94%      40.400us         4.94%      40.400us       6.733us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.64%       5.230us         0.64%       5.230us       5.230us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 806.170us
-Self CUDA time total: 30.749us
+Self CPU time total: 817.578us
+Self CUDA time total: 30.751us
 
 
 
@@ -4451,23 +4451,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     330.616us       770.43%     330.616us     330.616us             1  
-                                      hf_kernels_rotary        17.99%     148.834us        99.35%     822.020us     822.020us       0.000us         0.00%      45.761us      45.761us             1  
-                          _rotary_dba7d1e::apply_rotary         5.00%      41.361us        10.40%      86.012us      14.335us      25.857us        60.25%      25.857us       4.310us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.857us        60.25%      25.857us       4.310us             6  
-                                            aten::clone         2.40%      19.879us        66.22%     547.932us      91.322us       0.000us         0.00%      19.904us       3.317us             6  
-                                            aten::copy_         4.29%      35.520us        60.10%     497.262us      82.877us      17.056us        39.75%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        39.75%      17.056us       2.843us             6  
-                                Activity Buffer Request        32.00%     264.767us        32.00%     264.767us     264.767us       2.848us         6.64%       2.848us       2.848us             1  
-                                    aten::empty_strided         3.72%      30.791us         3.72%      30.791us       5.132us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.81%     196.975us        23.81%     196.975us      32.829us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.76%      31.140us         4.74%      39.242us       3.270us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.98%       8.102us         0.98%       8.102us       0.675us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.40%      44.651us         5.40%      44.651us       7.442us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       5.369us         0.65%       5.369us       5.369us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     316.538us       739.30%     316.538us     316.538us             1  
+                                      hf_kernels_rotary        19.03%     145.425us        99.27%     758.787us     758.787us       0.000us         0.00%      45.664us      45.664us             1  
+                          _rotary_dba7d1e::apply_rotary         5.13%      39.179us        10.36%      79.161us      13.194us      25.792us        60.24%      25.792us       4.299us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.792us        60.24%      25.792us       4.299us             6  
+                                            aten::clone         2.60%      19.840us        64.72%     494.691us      82.449us       0.000us         0.00%      19.872us       3.312us             6  
+                                            aten::copy_         4.32%      33.011us        58.33%     445.830us      74.305us      17.024us        39.76%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        39.76%      17.024us       2.837us             6  
+                                Activity Buffer Request        30.09%     230.025us        30.09%     230.025us     230.025us       2.848us         6.65%       2.848us       2.848us             1  
+                                    aten::empty_strided         3.80%      29.021us         3.80%      29.021us       4.837us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.92%     182.794us        23.92%     182.794us      30.466us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.11%      31.419us         5.17%      39.510us       3.293us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.06%       8.091us         1.06%       8.091us       0.674us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.23%      39.982us         5.23%      39.982us       6.664us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.73%       5.560us         0.73%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 827.389us
-Self CUDA time total: 42.913us
+Self CPU time total: 764.347us
+Self CUDA time total: 42.816us
 
 
 
@@ -4477,23 +4477,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     334.653us       358.15%     334.653us     334.653us             1  
-                                      hf_kernels_rotary        18.09%     151.114us        99.35%     829.780us     829.780us       0.000us         0.00%     109.215us     109.215us             1  
-                                            aten::clone         3.33%      27.839us        66.37%     554.323us      92.387us       0.000us         0.00%      68.064us      11.344us             6  
-                                            aten::copy_         4.18%      34.911us        59.27%     495.081us      82.513us      52.288us        55.96%      68.064us      11.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.288us        55.96%      52.288us       8.715us             6  
-                          _rotary_dba7d1e::apply_rotary         4.95%      41.342us         9.97%      83.303us      13.884us      41.151us        44.04%      41.151us       6.858us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.151us        44.04%      41.151us       6.858us             6  
-                                Activity Buffer Request        31.64%     264.256us        31.64%     264.256us     264.256us      15.776us        16.88%      15.776us      15.776us             1  
-                                    aten::empty_strided         3.76%      31.403us         3.76%      31.403us       5.234us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.46%     195.914us        23.46%     195.914us      32.652us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.87%      32.310us         4.91%      41.040us       3.420us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.05%       8.730us         1.05%       8.730us       0.728us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.02%      41.961us         5.02%      41.961us       6.994us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       5.470us         0.65%       5.470us       5.470us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     320.924us       345.95%     320.924us     320.924us             1  
+                                      hf_kernels_rotary        18.21%     146.024us        99.34%     796.707us     796.707us       0.000us         0.00%     108.351us     108.351us             1  
+                                            aten::clone         2.45%      19.620us        65.99%     529.232us      88.205us       0.000us         0.00%      67.071us      11.179us             6  
+                                            aten::copy_         4.16%      33.361us        60.01%     481.261us      80.210us      51.487us        55.50%      67.071us      11.179us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.487us        55.50%      51.487us       8.581us             6  
+                          _rotary_dba7d1e::apply_rotary         5.00%      40.060us        10.26%      82.291us      13.715us      41.280us        44.50%      41.280us       6.880us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.280us        44.50%      41.280us       6.880us             6  
+                                Activity Buffer Request        33.32%     267.235us        33.32%     267.235us     267.235us      15.584us        16.80%      15.584us      15.584us             1  
+                                    aten::empty_strided         3.54%      28.351us         3.54%      28.351us       4.725us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.53%     180.665us        22.53%     180.665us      30.111us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.88%      31.108us         4.88%      39.160us       3.263us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       8.052us         1.00%       8.052us       0.671us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.27%      42.231us         5.27%      42.231us       7.038us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       5.290us         0.66%       5.290us       5.290us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 835.250us
-Self CUDA time total: 93.439us
+Self CPU time total: 801.997us
+Self CUDA time total: 92.767us
 
 
 
@@ -4503,23 +4503,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     402.293us       278.14%     402.293us     402.293us             1  
-                                      hf_kernels_rotary        18.23%     162.247us        99.42%     884.932us     884.932us       0.000us         0.00%     168.347us     168.347us             1  
-                                            aten::clone         2.56%      22.809us        61.35%     546.082us      91.014us       0.000us         0.00%     105.212us      17.535us             6  
-                                            aten::copy_         3.90%      34.711us        55.16%     490.942us      81.824us      81.501us        56.35%     105.212us      17.535us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.501us        56.35%      81.501us      13.584us             6  
-                          _rotary_dba7d1e::apply_rotary         4.99%      44.421us        15.06%     134.003us      22.334us      63.135us        43.65%      63.135us      10.522us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.135us        43.65%      63.135us      10.522us             6  
-                                Activity Buffer Request        29.54%     262.907us        29.54%     262.907us     262.907us      23.711us        16.39%      23.711us      23.711us             1  
-                                    aten::empty_strided         3.63%      32.331us         3.63%      32.331us       5.388us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        21.72%     193.324us        21.72%     193.324us      32.221us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.75%      33.378us         4.79%      42.600us       3.550us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.04%       9.222us         1.04%       9.222us       0.769us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel        10.06%      89.582us        10.06%      89.582us      14.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.58%       5.120us         0.58%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.056us       226.11%     328.056us     328.056us             1  
+                                      hf_kernels_rotary        19.08%     156.414us        99.34%     814.518us     814.518us       0.000us         0.00%     168.799us     168.799us             1  
+                                            aten::clone         3.19%      26.153us        65.07%     533.532us      88.922us       0.000us         0.00%     105.343us      17.557us             6  
+                                            aten::copy_         3.98%      32.659us        58.36%     478.499us      79.750us      81.631us        56.26%     105.343us      17.557us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.631us        56.26%      81.631us      13.605us             6  
+                          _rotary_dba7d1e::apply_rotary         5.18%      42.511us        10.36%      84.962us      14.160us      63.456us        43.74%      63.456us      10.576us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.456us        43.74%      63.456us      10.576us             6  
+                                Activity Buffer Request        32.74%     268.426us        32.74%     268.426us     268.426us      23.712us        16.34%      23.712us      23.712us             1  
+                                    aten::empty_strided         3.52%      28.880us         3.52%      28.880us       4.813us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.64%     177.414us        21.64%     177.414us      29.569us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.88%      31.780us         4.83%      39.610us       3.301us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.95%       7.830us         0.95%       7.830us       0.652us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.18%      42.451us         5.18%      42.451us       7.075us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       5.440us         0.66%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 890.052us
-Self CUDA time total: 144.636us
+Self CPU time total: 819.958us
+Self CUDA time total: 145.087us
 
 
 
@@ -4529,23 +4529,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.030us       423.20%     335.030us     335.030us             1  
-                                      hf_kernels_rotary        17.60%     149.462us        99.35%     843.781us     843.781us       0.000us         0.00%      89.342us      89.342us             1  
-                                            aten::clone         2.76%      23.471us        67.14%     570.174us      95.029us       0.000us         0.00%      47.328us       7.888us             6  
-                                            aten::copy_         4.78%      40.570us        60.71%     515.622us      85.937us      37.152us        46.93%      47.328us       7.888us             6  
-                          _rotary_dba7d1e::apply_rotary         4.86%      41.293us         9.90%      84.043us      14.007us      42.014us        53.07%      42.014us       7.002us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      42.014us        53.07%      42.014us       7.002us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.152us        46.93%      37.152us       6.192us             6  
-                                Activity Buffer Request        33.04%     280.637us        33.04%     280.637us     280.637us      10.176us        12.85%      10.176us      10.176us             1  
-                                    aten::empty_strided         3.66%      31.081us         3.66%      31.081us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.89%     194.415us        22.89%     194.415us      32.403us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.65%      30.971us         4.72%      40.102us       3.342us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.08%       9.131us         1.08%       9.131us       0.761us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.03%      42.750us         5.03%      42.750us       7.125us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       5.500us         0.65%       5.500us       5.500us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     333.210us       419.37%     333.210us     333.210us             1  
+                                      hf_kernels_rotary        17.73%     150.595us        99.35%     844.049us     844.049us       0.000us         0.00%      89.630us      89.630us             1  
+                                            aten::clone         2.52%      21.432us        67.08%     569.832us      94.972us       0.000us         0.00%      47.583us       7.930us             6  
+                                            aten::copy_         3.93%      33.381us        61.05%     518.601us      86.433us      37.407us        47.08%      47.583us       7.930us             6  
+                          _rotary_dba7d1e::apply_rotary         4.77%      40.491us         9.89%      84.021us      14.004us      42.047us        52.92%      42.047us       7.008us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      42.047us        52.92%      42.047us       7.008us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.407us        47.08%      37.407us       6.235us             6  
+                                Activity Buffer Request        36.09%     306.637us        36.09%     306.637us     306.637us      10.176us        12.81%      10.176us      10.176us             1  
+                                    aten::empty_strided         3.51%      29.799us         3.51%      29.799us       4.966us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.02%     178.583us        21.02%     178.583us      29.764us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.71%      31.542us         4.66%      39.601us       3.300us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.95%       8.059us         0.95%       8.059us       0.672us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.12%      43.530us         5.12%      43.530us       7.255us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.65%       5.480us         0.65%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 849.281us
-Self CUDA time total: 79.166us
+Self CPU time total: 849.529us
+Self CUDA time total: 79.454us
 
 
 
@@ -4555,23 +4555,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.451us       231.31%     335.451us     335.451us             1  
-                                      hf_kernels_rotary        20.10%     149.473us        99.28%     738.378us     738.378us       0.000us         0.00%     168.637us     168.637us             1  
-                                            aten::clone         2.71%      20.190us        62.55%     465.160us      77.527us       0.000us         0.00%     104.892us      17.482us             6  
-                                            aten::copy_         4.80%      35.671us        55.64%     413.780us      68.963us      81.277us        56.04%     104.892us      17.482us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.277us        56.04%      81.277us      13.546us             6  
-                          _rotary_dba7d1e::apply_rotary         5.48%      40.762us        11.33%      84.273us      14.046us      63.745us        43.96%      63.745us      10.624us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.745us        43.96%      63.745us      10.624us             6  
-                                Activity Buffer Request        25.22%     187.575us        25.22%     187.575us     187.575us      23.615us        16.28%      23.615us      23.615us             1  
-                                    aten::empty_strided         4.19%      31.190us         4.19%      31.190us       5.198us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.62%     190.534us        25.62%     190.534us      31.756us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.11%      30.580us         5.31%      39.472us       3.289us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.20%       8.892us         1.20%       8.892us       0.741us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.85%      43.511us         5.85%      43.511us       7.252us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.72%       5.340us         0.72%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     324.316us       222.02%     324.316us     324.316us             1  
+                                      hf_kernels_rotary        18.22%     148.243us        99.33%     808.388us     808.388us       0.000us         0.00%     169.820us     169.820us             1  
+                                            aten::clone         2.32%      18.910us        66.43%     540.572us      90.095us       0.000us         0.00%     105.630us      17.605us             6  
+                                            aten::copy_         4.07%      33.161us        60.40%     491.511us      81.919us      81.886us        56.06%     105.630us      17.605us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.886us        56.06%      81.886us      13.648us             6  
+                          _rotary_dba7d1e::apply_rotary         4.96%      40.400us        10.02%      81.542us      13.590us      64.190us        43.94%      64.190us      10.698us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.190us        43.94%      64.190us      10.698us             6  
+                                Activity Buffer Request        33.97%     276.476us        33.97%     276.476us     276.476us      23.744us        16.25%      23.744us      23.744us             1  
+                                    aten::empty_strided         3.70%      30.151us         3.70%      30.151us       5.025us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.35%     181.874us        22.35%     181.874us      30.312us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.71%      30.201us         4.67%      38.031us       3.169us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.96%       7.830us         0.96%       7.830us       0.652us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.06%      41.142us         5.06%      41.142us       6.857us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.67%       5.420us         0.67%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 743.718us
-Self CUDA time total: 145.022us
+Self CPU time total: 813.808us
+Self CUDA time total: 146.076us
 
 
 
@@ -4581,23 +4581,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        13.47%     148.469us        71.31%     785.819us     785.819us       0.000us         0.00%     741.458us     741.458us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     683.092us       101.20%     683.092us     683.092us             1  
-                                            aten::clone         1.91%      21.000us        46.53%     512.672us      85.445us       0.000us         0.00%     558.390us      93.065us             6  
-                                            aten::copy_         3.10%      34.171us        41.83%     460.962us      76.827us     491.927us        72.88%     558.390us      93.065us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     491.927us        72.88%     491.927us      81.988us             6  
-                          _rotary_dba7d1e::apply_rotary         3.81%      41.992us         7.74%      85.293us      14.215us     183.068us        27.12%     183.068us      30.511us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     183.068us        27.12%     183.068us      30.511us             6  
-                                Activity Buffer Request        21.20%     233.636us        21.20%     233.636us     233.636us      66.463us         9.85%      66.463us      66.463us             1  
-                                    aten::empty_strided         2.79%      30.710us         2.79%      30.710us       5.118us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.53%     193.155us        17.53%     193.155us      32.192us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.79%      30.724us         3.57%      39.385us       3.282us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.79%       8.661us         0.79%       8.661us       0.722us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         3.93%      43.301us         3.93%      43.301us       7.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        28.69%     316.097us        28.69%     316.097us     316.097us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        13.82%     153.005us        74.75%     827.548us     827.548us       0.000us         0.00%     740.918us     740.918us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     682.102us       101.12%     682.102us     682.102us             1  
+                                            aten::clone         1.78%      19.750us        46.80%     518.112us      86.352us       0.000us         0.00%     558.489us      93.082us             6  
+                                            aten::copy_         3.14%      34.772us        42.33%     468.681us      78.114us     492.121us        72.96%     558.489us      93.082us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     492.121us        72.96%     492.121us      82.020us             6  
+                          _rotary_dba7d1e::apply_rotary         3.85%      42.609us         7.73%      85.591us      14.265us     182.429us        27.04%     182.429us      30.405us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     182.429us        27.04%     182.429us      30.405us             6  
+                                Activity Buffer Request        22.69%     251.255us        22.69%     251.255us     251.255us      66.368us         9.84%      66.368us      66.368us             1  
+                                    aten::empty_strided         2.68%      29.681us         2.68%      29.681us       4.947us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.50%     182.654us        16.50%     182.654us      30.442us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.62%      62.189us         6.40%      70.840us       5.903us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.78%       8.651us         0.78%       8.651us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.88%      42.982us         3.88%      42.982us       7.164us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        25.25%     279.606us        25.25%     279.606us     279.606us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.102ms
-Self CUDA time total: 674.995us
+Self CPU time total: 1.107ms
+Self CUDA time total: 674.550us
 
 
 
@@ -4607,30 +4607,30 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         5.24%     147.368us        25.60%     720.668us     720.668us       0.000us         0.00%       2.633ms       2.633ms             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.458ms       100.33%       2.458ms       2.458ms             1  
-                                            aten::clone         0.73%      20.601us        15.98%     449.780us      74.963us       0.000us         0.00%       1.394ms     232.377us             6  
-                                            aten::copy_         1.20%      33.732us        14.16%     398.690us      66.448us       1.211ms        49.43%       1.394ms     232.377us             6  
-                          _rotary_dba7d1e::apply_rotary         1.46%      41.000us         3.02%      85.091us      14.182us       1.239ms        50.57%       1.239ms     206.500us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.239ms        50.57%       1.239ms     206.500us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.211ms        49.43%       1.211ms     201.812us             6  
-                                Activity Buffer Request         6.06%     170.604us         6.06%     170.604us     170.604us     183.391us         7.49%     183.391us     183.391us             1  
-                                    aten::empty_strided         1.08%      30.489us         1.08%      30.489us       5.081us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.90%     194.354us         6.90%     194.354us      32.392us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.07%      30.231us         1.37%      38.429us       3.202us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.29%       8.198us         0.29%       8.198us       0.683us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.57%      44.091us         1.57%      44.091us       7.349us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        74.40%       2.094ms        74.40%       2.094ms       2.094ms       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         5.09%     145.551us        26.40%     755.006us     755.006us       0.000us         0.00%       2.627ms       2.627ms             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.456ms       100.32%       2.456ms       2.456ms             1  
+                                            aten::clone         0.65%      18.699us        17.06%     488.000us      81.333us       0.000us         0.00%       1.401ms     233.479us             6  
+                                            aten::copy_         1.14%      32.589us        15.40%     440.469us      73.412us       1.223ms        49.95%       1.401ms     233.479us             6  
+                          _rotary_dba7d1e::apply_rotary         1.42%      40.751us         2.92%      83.492us      13.915us       1.226ms        50.05%       1.226ms     204.274us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.226ms        50.05%       1.226ms     204.274us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.223ms        49.95%       1.223ms     203.836us             6  
+                                Activity Buffer Request         7.97%     227.955us         7.97%     227.955us     227.955us     177.858us         7.26%     177.858us     177.858us             1  
+                                    aten::empty_strided         1.01%      28.832us         1.01%      28.832us       4.805us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.29%     179.925us         6.29%     179.925us      29.987us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.05%      30.104us         1.33%      37.963us       3.164us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.27%       7.859us         0.27%       7.859us       0.655us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.49%      42.741us         1.49%      42.741us       7.124us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        73.60%       2.105ms        73.60%       2.105ms       2.105ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.815ms
-Self CUDA time total: 2.450ms
+Self CPU time total: 2.860ms
+Self CUDA time total: 2.449ms
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
-hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  True
+hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.07  True
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
 hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
@@ -4655,13 +4655,13 @@ hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
 
▶ UV Install Logs
-
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads. - -Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.08it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 2.71it/s]
+
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] +Fetching 5 files: 60%|██████ | 3/5 [00:00<00:00, 16.43it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.18it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 12.77it/s]

Artifacts:

rotary.jsonl diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html index 64a3cde02ad217a13f512d0df2993207a495ec02..683e483f2cce3522cb1b647e3d028ea883610b2f 100644 --- a/rotary/impls/torch_rotary.html +++ b/rotary/impls/torch_rotary.html @@ -3904,7 +3904,7 @@ Cell: nv | 0.25s
-
Fri Dec 19 19:54:55 2025       
+
Fri Dec 19 23:00:45 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.25s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   36C    P0             90W /  350W |       0MiB /  46068MiB |     17%      Default |
+| N/A   40C    P0             85W /  350W |       0MiB /  46068MiB |     24%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.25s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.93s
+Cell: benchmark | 7.89s
  | 
 
 Raw
@@ -4016,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.121ms      1256.65%       1.121ms       1.121ms             1  
-                                            torch_eager        12.91%     396.632us        99.49%       3.057ms       3.057ms       0.000us         0.00%      90.428us      90.428us             1  
-                                              aten::mul         6.11%     187.745us        10.64%     326.977us      13.624us      47.168us        52.87%      47.168us       1.965us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.168us        52.87%      47.168us       1.965us            24  
-                                            aten::copy_         3.73%     114.507us        64.51%       1.982ms     110.114us      28.957us        32.46%      30.173us       1.676us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.335us        25.04%      22.335us       1.861us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.087us        14.67%      13.087us       1.091us            12  
-                                            aten::clone         1.26%      38.732us        62.92%       1.933ms     322.183us       0.000us         0.00%       7.838us       1.306us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.622us         7.42%       6.622us       1.104us             6  
-                                              aten::sub         1.53%      47.130us         2.58%      79.281us      13.214us       6.559us         7.35%       6.559us       1.093us             6  
-                                              aten::add         1.27%      38.901us         2.21%      67.791us      11.299us       6.528us         7.32%       6.528us       1.088us             6  
-                                Activity Buffer Request        55.82%       1.715ms        55.82%       1.715ms       1.715ms       1.216us         1.36%       1.216us       1.216us             1  
-                                    aten::empty_strided         2.00%      61.422us         2.00%      61.422us      10.237us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.49%      76.434us         2.49%      76.434us      12.739us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.64%      81.233us         3.38%     103.763us       4.323us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      22.530us         0.73%      22.530us       0.939us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.00%     276.368us         9.00%     276.368us       5.758us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.51%      15.650us         0.51%      15.650us      15.650us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.009ms      1132.93%       1.009ms       1.009ms             1  
+                                            torch_eager        12.41%     369.428us        99.52%       2.961ms       2.961ms       0.000us         0.00%      90.371us      90.371us             1  
+                                              aten::mul         5.71%     169.883us         9.58%     285.017us      11.876us      47.039us        52.80%      47.039us       1.960us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.039us        52.80%      47.039us       1.960us            24  
+                                            aten::copy_         3.61%     107.364us        66.50%       1.979ms     109.941us      28.963us        32.51%      30.243us       1.680us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.243us        24.97%      22.243us       1.854us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.089us        14.69%      13.089us       1.091us            12  
+                                            aten::clone         1.40%      41.760us        65.34%       1.944ms     324.049us       0.000us         0.00%       8.000us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.720us         7.54%       6.720us       1.120us             6  
+                                              aten::sub         1.56%      46.400us         2.42%      72.061us      12.010us       6.561us         7.36%       6.561us       1.094us             6  
+                                              aten::add         1.27%      37.811us         2.01%      59.841us       9.973us       6.528us         7.33%       6.528us       1.088us             6  
+                                Activity Buffer Request        58.48%       1.740ms        58.48%       1.740ms       1.740ms       1.280us         1.44%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.77%      52.541us         1.77%      52.541us       8.757us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.39%      71.252us         2.39%      71.252us      11.875us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      78.291us         3.42%     101.892us       4.245us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.79%      23.601us         0.79%      23.601us       0.983us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.49%     222.745us         7.49%     222.745us       4.641us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.48%      14.390us         0.48%      14.390us      14.390us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.072ms
-Self CUDA time total: 89.212us
+Self CPU time total: 2.976ms
+Self CUDA time total: 89.091us
 
 
 
@@ -4046,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.007ms      1113.49%       1.007ms       1.007ms             1  
-                                            torch_eager        10.54%     298.489us        99.78%       2.827ms       2.827ms       0.000us         0.00%      91.582us      91.582us             1  
-                                              aten::mul         6.23%     176.544us        10.66%     301.947us      12.581us      47.549us        52.58%      47.549us       1.981us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.549us        52.58%      47.549us       1.981us            24  
-                                            aten::copy_         3.68%     104.131us        68.91%       1.952ms     108.446us      29.376us        32.49%      30.529us       1.696us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.591us        24.98%      22.591us       1.883us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.504us        14.93%      13.504us       1.125us            12  
-                                            aten::clone         0.77%      21.889us        65.76%       1.863ms     310.451us       0.000us         0.00%       7.938us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us         7.50%       6.785us       1.131us             6  
-                                              aten::sub         1.41%      40.059us         2.39%      67.780us      11.297us       6.784us         7.50%       6.784us       1.131us             6  
-                                              aten::add         1.28%      36.151us         2.32%      65.853us      10.975us       6.720us         7.43%       6.720us       1.120us             6  
-                                Activity Buffer Request        60.61%       1.717ms        60.61%       1.717ms       1.717ms       1.153us         1.28%       1.153us       1.153us             1  
-                                    aten::empty_strided         1.12%      31.742us         1.12%      31.742us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.04%      57.701us         2.04%      57.701us       9.617us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.45%      69.273us         3.06%      86.813us       3.617us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      17.540us         0.62%      17.540us       0.731us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.05%     256.228us         9.05%     256.228us       5.338us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       6.150us         0.22%       6.150us       6.150us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     914.978us      1014.29%     914.978us     914.978us             1  
+                                            torch_eager        10.68%     295.102us        99.80%       2.758ms       2.758ms       0.000us         0.00%      91.361us      91.361us             1  
+                                              aten::mul         5.65%     156.202us         9.56%     264.265us      11.011us      47.519us        52.68%      47.519us       1.980us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.519us        52.68%      47.519us       1.980us            24  
+                                            aten::copy_         3.57%      98.672us        70.68%       1.953ms     108.523us      29.314us        32.50%      30.466us       1.693us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.529us        24.97%      22.529us       1.877us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.83%      13.376us       1.115us            12  
+                                            aten::clone         0.80%      22.200us        68.13%       1.883ms     313.805us       0.000us         0.00%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us         7.52%       6.785us       1.131us             6  
+                                              aten::add         1.12%      30.913us         1.88%      51.842us       8.640us       6.688us         7.41%       6.688us       1.115us             6  
+                                              aten::sub         1.32%      36.532us         2.17%      59.963us       9.994us       6.688us         7.41%       6.688us       1.115us             6  
+                                Activity Buffer Request        63.06%       1.743ms        63.06%       1.743ms       1.743ms       1.152us         1.28%       1.152us       1.152us             1  
+                                    aten::empty_strided         1.09%      30.200us         1.09%      30.200us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         1.91%      52.752us         1.91%      52.752us       8.792us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.31%      63.802us         2.94%      81.195us       3.383us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.63%      17.393us         0.63%      17.393us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.65%     211.533us         7.65%     211.533us       4.407us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.491us         0.20%       5.491us       5.491us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.833ms
-Self CUDA time total: 90.429us
+Self CPU time total: 2.764ms
+Self CUDA time total: 90.209us
 
 
 
@@ -4076,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms      1072.44%       1.006ms       1.006ms             1  
-                                            torch_eager        10.52%     298.395us        99.78%       2.831ms       2.831ms       0.000us         0.00%      95.037us      95.037us             1  
-                                              aten::mul         6.11%     173.334us        10.70%     303.588us      12.649us      48.703us        51.91%      48.703us       2.029us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.703us        51.91%      48.703us       2.029us            24  
-                                            aten::copy_         3.65%     103.459us        69.19%       1.963ms     109.044us      30.654us        32.67%      31.870us       1.771us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.975us        24.49%      22.975us       1.915us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        15.42%      14.464us       1.205us            12  
-                                            aten::clone         0.76%      21.692us        65.97%       1.871ms     311.909us       0.000us         0.00%       8.895us       1.482us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us         8.18%       7.679us       1.280us             6  
-                                              aten::add         1.20%      34.161us         2.12%      60.242us      10.040us       7.233us         7.71%       7.233us       1.206us             6  
-                                              aten::sub         1.38%      39.061us         2.38%      67.492us      11.249us       7.231us         7.71%       7.231us       1.205us             6  
-                                Activity Buffer Request        60.90%       1.728ms        60.90%       1.728ms       1.728ms       1.216us         1.30%       1.216us       1.216us             1  
-                                    aten::empty_strided         1.08%      30.530us         1.08%      30.530us       5.088us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.06%      58.531us         2.06%      58.531us       9.755us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.41%      68.445us         3.03%      85.834us       3.576us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.61%      17.389us         0.61%      17.389us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.10%     258.010us         9.10%     258.010us       5.375us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       6.240us         0.22%       6.240us       6.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     896.092us       952.15%     896.092us     896.092us             1  
+                                            torch_eager        10.34%     284.358us        99.81%       2.744ms       2.744ms       0.000us         0.00%      95.424us      95.424us             1  
+                                              aten::mul         5.45%     149.791us         9.32%     256.084us      10.670us      48.736us        51.79%      48.736us       2.031us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.736us        51.79%      48.736us       2.031us            24  
+                                            aten::copy_         3.52%      96.699us        71.27%       1.959ms     108.841us      30.943us        32.88%      32.255us       1.792us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.071us        24.51%      23.071us       1.923us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.433us        15.34%      14.433us       1.203us            12  
+                                            aten::clone         0.75%      20.741us        68.58%       1.885ms     314.190us       0.000us         0.00%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.36%       7.872us       1.312us             6  
+                                              aten::sub         1.32%      36.203us         2.15%      58.992us       9.832us       7.264us         7.72%       7.264us       1.211us             6  
+                                              aten::add         1.12%      30.822us         1.89%      51.942us       8.657us       7.169us         7.62%       7.169us       1.195us             6  
+                                Activity Buffer Request        63.70%       1.751ms        63.70%       1.751ms       1.751ms       1.312us         1.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.08%      29.590us         1.08%      29.590us       4.932us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         1.85%      50.982us         1.85%      50.982us       8.497us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.36%      64.969us         3.01%      82.862us       3.453us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.65%      17.893us         0.65%      17.893us       0.746us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.66%     210.524us         7.66%     210.524us       4.386us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.270us         0.19%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.837ms
-Self CUDA time total: 93.821us
+Self CPU time total: 2.749ms
+Self CUDA time total: 94.112us
 
 
 
@@ -4106,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.008ms       991.12%       1.008ms       1.008ms             1  
-                                            torch_eager        11.39%     291.220us        99.77%       2.552ms       2.552ms       0.000us         0.00%     103.069us     103.069us             1  
-                                              aten::mul         6.75%     172.580us        11.78%     301.232us      12.551us      52.797us        51.90%      52.797us       2.200us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.797us        51.90%      52.797us       2.200us            24  
-                                            aten::copy_         3.99%     102.084us        65.85%       1.684ms      93.565us      32.544us        31.99%      33.888us       1.883us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        24.32%      24.736us       2.061us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.384us        16.11%      16.384us       1.365us            12  
-                                            aten::clone         0.85%      21.760us        62.39%       1.596ms     265.957us       0.000us         0.00%       9.152us       1.525us             6  
-                                              aten::sub         1.61%      41.170us         2.75%      70.342us      11.724us       8.256us         8.12%       8.256us       1.376us             6  
-                                              aten::add         1.37%      35.121us         2.54%      64.851us      10.809us       8.128us         7.99%       8.128us       1.355us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.68%       7.808us       1.301us             6  
-                                Activity Buffer Request        49.27%       1.260ms        49.27%       1.260ms       1.260ms       1.344us         1.32%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.23%      31.560us         1.23%      31.560us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.74%     249.077us         9.74%     249.077us      41.513us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.69%      68.742us         3.39%      86.703us       3.613us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      17.961us         0.70%      17.961us       0.748us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        10.18%     260.466us        10.18%     260.466us       5.426us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.23%       5.870us         0.23%       5.870us       5.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     914.202us       899.81%     914.202us     914.202us             1  
+                                            torch_eager         9.98%     278.657us        99.81%       2.786ms       2.786ms       0.000us         0.00%     102.911us     102.911us             1  
+                                              aten::mul         5.55%     155.014us         9.50%     265.176us      11.049us      52.799us        51.97%      52.799us       2.200us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.799us        51.97%      52.799us       2.200us            24  
+                                            aten::copy_         3.52%      98.370us        71.20%       1.987ms     110.396us      32.288us        31.78%      33.600us       1.867us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.544us        24.16%      24.544us       2.045us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.512us        16.25%      16.512us       1.376us            12  
+                                            aten::clone         0.72%      20.142us        68.59%       1.914ms     319.034us       0.000us         0.00%       9.056us       1.509us             6  
+                                              aten::sub         1.33%      37.171us         2.19%      61.241us      10.207us       8.320us         8.19%       8.320us       1.387us             6  
+                                              aten::add         1.42%      39.541us         2.20%      61.471us      10.245us       8.192us         8.06%       8.192us       1.365us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.62%       7.744us       1.291us             6  
+                                Activity Buffer Request        57.78%       1.613ms        57.78%       1.613ms       1.613ms       1.312us         1.29%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.10%      30.730us         1.10%      30.730us       5.122us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.81%     218.005us         7.81%     218.005us      36.334us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.28%      63.724us         2.90%      81.002us       3.375us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.62%      17.278us         0.62%      17.278us       0.720us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.68%     214.384us         7.68%     214.384us       4.466us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.360us         0.19%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.558ms
-Self CUDA time total: 101.725us
+Self CPU time total: 2.791ms
+Self CUDA time total: 101.599us
 
 
 
@@ -4136,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.014ms      1081.02%       1.014ms       1.014ms             1  
-                                            torch_eager        10.25%     325.467us        99.80%       3.170ms       3.170ms       0.000us         0.00%      95.040us      95.040us             1  
-                                              aten::mul         5.52%     175.344us         9.66%     306.970us      12.790us      48.800us        52.05%      48.800us       2.033us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.800us        52.05%      48.800us       2.033us            24  
-                                            aten::copy_         3.31%     105.233us        71.23%       2.263ms     125.701us      30.784us        32.83%      32.064us       1.781us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.041us        24.57%      23.041us       1.920us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.12%      14.176us       1.181us            12  
-                                            aten::clone         0.88%      27.881us        68.64%       2.180ms     363.408us       0.000us         0.00%       9.023us       1.504us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         8.26%       7.743us       1.290us             6  
-                                              aten::sub         1.21%      38.440us         2.12%      67.480us      11.247us       7.136us         7.61%       7.136us       1.189us             6  
-                                              aten::add         1.04%      32.931us         1.90%      60.251us      10.042us       7.040us         7.51%       7.040us       1.173us             6  
-                                Activity Buffer Request        54.81%       1.741ms        54.81%       1.741ms       1.741ms       1.280us         1.37%       1.280us       1.280us             1  
-                                    aten::empty_strided         1.02%      32.521us         1.02%      32.521us       5.420us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.79%     342.870us        10.79%     342.870us      57.145us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.15%      68.383us         2.74%      86.883us       3.620us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.58%      18.500us         0.58%      18.500us       0.771us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.23%     261.446us         8.23%     261.446us       5.447us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       6.450us         0.20%       6.450us       6.450us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     915.332us       974.61%     915.332us     915.332us             1  
+                                            torch_eager        10.01%     309.953us        99.82%       3.091ms       3.091ms       0.000us         0.00%      95.198us      95.198us             1  
+                                              aten::mul         4.96%     153.567us         8.55%     264.920us      11.038us      48.833us        52.00%      48.833us       2.035us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.833us        52.00%      48.833us       2.035us            24  
+                                            aten::copy_         3.18%      98.342us        72.96%       2.259ms     125.517us      30.876us        32.88%      32.156us       1.786us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.070us        24.56%      23.070us       1.923us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.209us        15.13%      14.209us       1.184us            12  
+                                            aten::clone         0.91%      28.102us        71.00%       2.199ms     366.430us       0.000us         0.00%       9.086us       1.514us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.806us         8.31%       7.806us       1.301us             6  
+                                              aten::sub         1.22%      37.870us         1.99%      61.740us      10.290us       7.168us         7.63%       7.168us       1.195us             6  
+                                              aten::add         0.97%      30.001us         1.65%      51.171us       8.529us       7.041us         7.50%       7.041us       1.173us             6  
+                                Activity Buffer Request        56.99%       1.765ms        56.99%       1.765ms       1.765ms       1.280us         1.36%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.03%      31.751us         1.03%      31.751us       5.292us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.91%     337.859us        10.91%     337.859us      56.310us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.16%      66.933us         2.72%      84.261us       3.511us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.56%      17.328us         0.56%      17.328us       0.722us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.93%     214.663us         6.93%     214.663us       4.472us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.580us         0.18%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.177ms
-Self CUDA time total: 93.760us
+Self CPU time total: 3.097ms
+Self CUDA time total: 93.918us
 
 
 
@@ -4166,26 +4166,26 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.001ms       983.06%       1.001ms       1.001ms             1  
-                                            torch_eager         9.84%     295.255us        99.81%       2.995ms       2.995ms       0.000us         0.00%     103.136us     103.136us             1  
-                                              aten::mul         5.72%     171.754us         9.96%     298.939us      12.456us      52.896us        51.95%      52.896us       2.204us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.896us        51.95%      52.896us       2.204us            24  
-                                            aten::copy_         3.46%     103.695us        71.00%       2.131ms     118.374us      32.417us        31.84%      33.729us       1.874us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.673us        24.23%      24.673us       2.056us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.511us        16.22%      16.511us       1.376us            12  
-                                            aten::clone         0.71%      21.410us        67.93%       2.039ms     339.780us       0.000us         0.00%       9.056us       1.509us             6  
-                                              aten::sub         1.35%      40.422us         2.36%      70.682us      11.780us       8.319us         8.17%       8.319us       1.386us             6  
-                                              aten::add         1.19%      35.599us         2.15%      64.481us      10.747us       8.192us         8.05%       8.192us       1.365us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.61%       7.744us       1.291us             6  
-                                Activity Buffer Request        57.66%       1.731ms        57.66%       1.731ms       1.731ms       1.312us         1.29%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.04%      31.291us         1.04%      31.291us       5.215us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.41%     222.325us         7.41%     222.325us      37.054us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.17%      65.101us         2.75%      82.681us       3.445us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      17.580us         0.59%      17.580us       0.733us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.68%     260.519us         8.68%     260.519us       5.427us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.671us         0.19%       5.671us       5.671us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     991.450us       973.69%     991.450us     991.450us             1  
+                                            torch_eager        12.00%     374.920us        99.82%       3.119ms       3.119ms       0.000us         0.00%     103.167us     103.167us             1  
+                                              aten::mul         4.76%     148.675us         8.21%     256.446us      10.685us      52.898us        51.95%      52.898us       2.204us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.898us        51.95%      52.898us       2.204us            24  
+                                            aten::copy_         3.10%      96.880us        71.33%       2.229ms     123.830us      32.414us        31.83%      33.757us       1.875us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.575us        24.13%      24.575us       2.048us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.512us        16.22%      16.512us       1.376us            12  
+                                            aten::clone         0.70%      21.890us        69.11%       2.160ms     359.941us       0.000us         0.00%       9.182us       1.530us             6  
+                                              aten::sub         1.16%      36.128us         1.91%      59.601us       9.934us       8.320us         8.17%       8.320us       1.387us             6  
+                                              aten::add         1.18%      36.821us         1.99%      62.122us      10.354us       8.192us         8.05%       8.192us       1.365us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.839us         7.70%       7.839us       1.307us             6  
+                                Activity Buffer Request        56.19%       1.756ms        56.19%       1.756ms       1.756ms       1.343us         1.32%       1.343us       1.343us             1  
+                                    aten::empty_strided         0.98%      30.741us         0.98%      30.741us       5.124us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.19%     318.287us        10.19%     318.287us      53.048us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.12%      66.300us         2.71%      84.551us       3.523us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      18.251us         0.58%      18.251us       0.760us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.87%     214.557us         6.87%     214.557us       4.470us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.580us         0.18%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.001ms
+Self CPU time total: 3.125ms
 Self CUDA time total: 101.824us
 
 
@@ -4196,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       840.95%       1.015ms       1.015ms             1  
-                                            torch_eager         9.74%     291.664us        99.80%       2.987ms       2.987ms       0.000us         0.00%     122.530us     122.530us             1  
-                                              aten::mul         5.84%     174.903us        10.28%     307.588us      12.816us      62.144us        51.48%      62.144us       2.589us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.144us        51.48%      62.144us       2.589us            24  
-                                            aten::copy_         3.43%     102.713us        70.67%       2.115ms     117.501us      39.265us        32.53%      41.090us       2.283us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.91%      28.864us       2.405us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.99%      19.296us       1.608us            12  
-                                            aten::clone         0.71%      21.240us        67.46%       2.019ms     336.492us       0.000us         0.00%      12.226us       2.038us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.401us         8.62%      10.401us       1.733us             6  
-                                              aten::add         1.16%      34.581us         2.15%      64.442us      10.740us       9.696us         8.03%       9.696us       1.616us             6  
-                                              aten::sub         1.36%      40.831us         2.36%      70.613us      11.769us       9.600us         7.95%       9.600us       1.600us             6  
-                                Activity Buffer Request        57.99%       1.736ms        57.99%       1.736ms       1.736ms       1.825us         1.51%       1.825us       1.825us             1  
-                                    aten::empty_strided         1.04%      31.082us         1.04%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.60%     197.545us         6.60%     197.545us      32.924us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.24%      66.920us         2.85%      85.310us       3.555us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.61%      18.390us         0.61%      18.390us       0.766us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.07%     271.392us         9.07%     271.392us       5.654us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       6.001us         0.20%       6.001us       6.001us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     898.909us       743.56%     898.909us     898.909us             1  
+                                            torch_eager         9.91%     292.126us        99.82%       2.942ms       2.942ms       0.000us         0.00%     122.717us     122.717us             1  
+                                              aten::mul         5.16%     152.179us         8.78%     258.933us      10.789us      62.079us        51.35%      62.079us       2.587us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.079us        51.35%      62.079us       2.587us            24  
+                                            aten::copy_         3.24%      95.439us        72.74%       2.144ms     119.116us      39.487us        32.66%      41.311us       2.295us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.927us        23.93%      28.927us       2.411us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.327us        15.99%      19.327us       1.611us            12  
+                                            aten::clone         0.77%      22.660us        70.42%       2.076ms     345.928us       0.000us         0.00%      12.384us       2.064us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us         8.73%      10.560us       1.760us             6  
+                                              aten::sub         1.22%      36.000us         2.00%      58.931us       9.822us       9.664us         7.99%       9.664us       1.611us             6  
+                                              aten::add         1.08%      31.800us         1.80%      53.111us       8.852us       9.663us         7.99%       9.663us       1.611us             6  
+                                Activity Buffer Request        57.43%       1.693ms        57.43%       1.693ms       1.693ms       1.824us         1.51%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.01%      29.882us         1.01%      29.882us       4.980us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.11%     297.916us        10.11%     297.916us      49.653us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.20%      64.806us         2.80%      82.537us       3.439us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.60%      17.731us         0.60%      17.731us       0.739us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.09%     208.898us         7.09%     208.898us       4.352us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.241us         0.18%       5.241us       5.241us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.993ms
-Self CUDA time total: 120.705us
+Self CPU time total: 2.948ms
+Self CUDA time total: 120.893us
 
 
 
@@ -4226,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.002ms       581.09%       1.002ms       1.002ms             1  
-                                            torch_eager        20.60%     295.691us        99.58%       1.430ms       1.430ms       0.000us         0.00%     175.327us     175.327us             1  
-                                              aten::mul        11.87%     170.476us        20.75%     297.916us      12.413us      89.503us        51.89%      89.503us       3.729us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.503us        51.89%      89.503us       3.729us            24  
-                                            aten::copy_         7.24%     103.879us        39.87%     572.479us      31.804us      57.887us        33.56%      60.735us       3.374us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.832us        23.67%      40.832us       3.403us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.089us        14.55%      25.089us       2.091us            12  
-                                            aten::clone         1.51%      21.660us        33.62%     482.661us      80.443us       0.000us         0.00%      19.903us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.055us         9.89%      17.055us       2.842us             6  
-                                              aten::sub         2.80%      40.184us         4.70%      67.454us      11.242us      12.546us         7.27%      12.546us       2.091us             6  
-                                              aten::add         2.41%      34.531us         4.25%      60.992us      10.165us      12.543us         7.27%      12.543us       2.090us             6  
-                                Activity Buffer Request        14.04%     201.545us        14.04%     201.545us     201.545us       2.848us         1.65%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.07%      29.712us         2.07%      29.712us       4.952us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.60%     195.253us        13.60%     195.253us      32.542us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.57%      65.572us         5.83%      83.731us       3.489us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.26%      18.159us         1.26%      18.159us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.62%     252.973us        17.62%     252.973us       5.270us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       6.061us         0.42%       6.061us       6.061us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     904.612us       526.12%     904.612us     904.612us             1  
+                                            torch_eager        19.42%     290.431us        99.64%       1.490ms       1.490ms       0.000us         0.00%     174.789us     174.789us             1  
+                                              aten::mul        10.19%     152.404us        17.63%     263.706us      10.988us      89.218us        51.89%      89.218us       3.717us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.218us        51.89%      89.218us       3.717us            24  
+                                            aten::copy_         6.40%      95.669us        46.12%     689.622us      38.312us      57.665us        33.54%      60.513us       3.362us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.577us        23.60%      40.577us       3.381us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.058us        14.57%      25.058us       2.088us            12  
+                                            aten::clone         1.32%      19.751us        41.22%     616.413us     102.735us       0.000us         0.00%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.94%      17.088us       2.848us             6  
+                                              aten::add         2.23%      33.380us         3.73%      55.850us       9.308us      12.577us         7.31%      12.577us       2.096us             6  
+                                              aten::sub         2.48%      37.041us         4.08%      60.981us      10.164us      12.481us         7.26%      12.481us       2.080us             6  
+                                Activity Buffer Request        17.03%     254.616us        17.03%     254.616us     254.616us       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.98%      29.551us         1.98%      29.551us       4.925us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.81%     281.226us        18.81%     281.226us      46.871us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.20%      62.788us         5.36%      80.180us       3.341us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.16%      17.392us         1.16%      17.392us       0.725us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.43%     215.823us        14.43%     215.823us       4.496us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.360us         0.36%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.436ms
-Self CUDA time total: 172.479us
+Self CPU time total: 1.495ms
+Self CUDA time total: 171.941us
 
 
 
@@ -4256,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     998.580us       827.30%     998.580us     998.580us             1  
-                                            torch_eager        20.47%     288.290us        99.58%       1.403ms       1.403ms       0.000us         0.00%     122.464us     122.464us             1  
-                                              aten::mul        12.45%     175.343us        21.53%     303.228us      12.634us      62.175us        51.51%      62.175us       2.591us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.175us        51.51%      62.175us       2.591us            24  
-                                            aten::copy_         7.38%     103.973us        38.88%     547.625us      30.424us      39.265us        32.53%      41.025us       2.279us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.86%      28.800us       2.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.96%      19.264us       1.605us            12  
-                                            aten::clone         1.40%      19.710us        32.30%     454.921us      75.820us       0.000us         0.00%      12.225us       2.038us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.465us         8.67%      10.465us       1.744us             6  
-                                              aten::add         2.42%      34.049us         4.36%      61.370us      10.228us       9.695us         8.03%       9.695us       1.616us             6  
-                                              aten::sub         2.76%      38.918us         4.74%      66.750us      11.125us       9.569us         7.93%       9.569us       1.595us             6  
-                                Activity Buffer Request        13.00%     183.145us        13.00%     183.145us     183.145us       1.760us         1.46%       1.760us       1.760us             1  
-                                    aten::empty_strided         2.13%      30.010us         2.13%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.33%     187.794us        13.33%     187.794us      31.299us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.79%      67.456us         6.09%      85.722us       3.572us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.30%      18.266us         1.30%      18.266us       0.761us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.16%     255.751us        18.16%     255.751us       5.328us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       5.851us         0.42%       5.851us       5.851us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     891.715us       739.56%     891.715us     891.715us             1  
+                                            torch_eager        18.90%     276.136us        99.61%       1.455ms       1.455ms       0.000us         0.00%     122.334us     122.334us             1  
+                                              aten::mul        10.73%     156.714us        18.01%     263.146us      10.964us      61.921us        51.36%      61.921us       2.580us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.921us        51.36%      61.921us       2.580us            24  
+                                            aten::copy_         6.60%      96.484us        46.13%     674.035us      37.446us      39.325us        32.61%      41.085us       2.283us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.894us        23.96%      28.894us       2.408us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.328us        16.03%      19.328us       1.611us            12  
+                                            aten::clone         1.42%      20.789us        41.25%     602.732us     100.455us       0.000us         0.00%      12.191us       2.032us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us         8.65%      10.431us       1.738us             6  
+                                              aten::add         2.19%      32.032us         3.63%      53.102us       8.850us       9.696us         8.04%       9.696us       1.616us             6  
+                                              aten::sub         2.44%      35.611us         4.02%      58.732us       9.789us       9.632us         7.99%       9.632us       1.605us             6  
+                                Activity Buffer Request        17.44%     254.736us        17.44%     254.736us     254.736us       1.760us         1.46%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.08%      30.391us         2.08%      30.391us       5.065us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.06%     263.885us        18.06%     263.885us      43.981us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.23%      61.792us         5.41%      79.021us       3.293us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.18%      17.229us         1.18%      17.229us       0.718us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.34%     209.553us        14.34%     209.553us       4.366us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.690us         0.39%       5.690us       5.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.409ms
-Self CUDA time total: 120.704us
+Self CPU time total: 1.461ms
+Self CUDA time total: 120.574us
 
 
 
@@ -4286,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     990.551us       575.17%     990.551us     990.551us             1  
-                                            torch_eager         9.55%     281.107us        99.81%       2.938ms       2.938ms       0.000us         0.00%     175.100us     175.100us             1  
-                                              aten::mul         5.81%     170.942us        10.23%     301.167us      12.549us      89.468us        51.95%      89.468us       3.728us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.468us        51.95%      89.468us       3.728us            24  
-                                            aten::copy_         3.50%     102.971us        70.92%       2.088ms     115.984us      57.728us        33.52%      60.608us       3.367us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.800us        23.69%      40.800us       3.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.024us        14.53%      25.024us       2.085us            12  
-                                            aten::clone         0.79%      23.370us        68.06%       2.004ms     333.942us       0.000us         0.00%      19.808us       3.301us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.83%      16.928us       2.821us             6  
-                                              aten::add         1.19%      35.020us         2.08%      61.181us      10.197us      12.513us         7.27%      12.513us       2.085us             6  
-                                              aten::sub         1.27%      37.451us         2.29%      67.451us      11.242us      12.511us         7.26%      12.511us       2.085us             6  
-                                Activity Buffer Request        58.47%       1.721ms        58.47%       1.721ms       1.721ms       2.880us         1.67%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.04%      30.722us         1.04%      30.722us       5.120us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.59%     193.904us         6.59%     193.904us      32.317us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.29%      67.323us         2.91%      85.623us       3.568us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      18.300us         0.62%      18.300us       0.762us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.69%     255.869us         8.69%     255.869us       5.331us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.631us         0.19%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     898.269us       521.86%     898.269us     898.269us             1  
+                                            torch_eager         9.37%     284.685us        99.81%       3.032ms       3.032ms       0.000us         0.00%     174.974us     174.974us             1  
+                                              aten::mul         4.95%     150.441us         8.39%     254.798us      10.617us      89.503us        52.00%      89.503us       3.729us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.503us        52.00%      89.503us       3.729us            24  
+                                            aten::copy_         3.17%      96.407us        73.91%       2.246ms     124.750us      57.409us        33.35%      60.256us       3.348us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.513us        23.54%      40.513us       3.376us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.215us        14.65%      25.215us       2.101us            12  
+                                            aten::clone         0.72%      21.929us        71.62%       2.176ms     362.673us       0.000us         0.00%      19.743us       3.291us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.896us         9.82%      16.896us       2.816us             6  
+                                              aten::add         1.01%      30.763us         1.71%      51.873us       8.645us      12.671us         7.36%      12.671us       2.112us             6  
+                                              aten::sub         1.24%      37.781us         2.01%      61.191us      10.198us      12.544us         7.29%      12.544us       2.091us             6  
+                                Activity Buffer Request        60.27%       1.831ms        60.27%       1.831ms       1.831ms       2.847us         1.65%       2.847us       2.847us             1  
+                                    aten::empty_strided         0.99%      30.010us         0.99%      30.010us       5.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.59%     260.978us         8.59%     260.978us      43.496us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.13%      64.731us         2.72%      82.502us       3.438us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      17.771us         0.58%      17.771us       0.740us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.78%     205.930us         6.78%     205.930us       4.290us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.660us         0.19%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.944ms
-Self CUDA time total: 172.220us
+Self CPU time total: 3.038ms
+Self CUDA time total: 172.127us
 
 
 
@@ -4316,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     996.043us       348.79%     996.043us     996.043us             1  
-                                            torch_eager        20.87%     290.654us        99.57%       1.387ms       1.387ms       0.000us         0.00%     304.030us     304.030us             1  
-                                              aten::mul        12.33%     171.709us        21.56%     300.326us      12.514us     133.152us        46.63%     133.152us       5.548us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.152us        46.63%     133.152us       5.548us            24  
-                                            aten::copy_         7.20%     100.319us        38.33%     533.940us      29.663us     111.295us        38.97%     129.758us       7.209us            18  
-                                            aten::clone         1.43%      19.920us        31.74%     442.109us      73.685us       0.000us         0.00%      72.510us      12.085us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.248us        20.05%      57.248us       4.771us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.047us        18.93%      54.047us       9.008us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.120us        14.40%      41.120us       3.427us            12  
-                                              aten::sub         2.81%      39.111us         4.76%      66.281us      11.047us      20.672us         7.24%      20.672us       3.445us             6  
-                                              aten::add         2.49%      34.670us         4.42%      61.560us      10.260us      20.448us         7.16%      20.448us       3.408us             6  
-                                Activity Buffer Request        12.17%     169.484us        12.17%     169.484us     169.484us      18.463us         6.47%      18.463us      18.463us             1  
-                                    aten::empty_strided         2.13%      29.640us         2.13%      29.640us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.75%     191.524us        13.75%     191.524us      31.921us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.84%      67.441us         6.08%      84.703us       3.529us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.24%      17.262us         1.24%      17.262us       0.719us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.33%     255.290us        18.33%     255.290us       5.319us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.43%       5.940us         0.43%       5.940us       5.940us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.882us       318.77%     909.882us     909.882us             1  
+                                            torch_eager        19.71%     283.050us        99.60%       1.431ms       1.431ms       0.000us         0.00%     303.835us     303.835us             1  
+                                              aten::mul        10.48%     150.585us        17.83%     256.036us      10.668us     133.276us        46.69%     133.276us       5.553us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.276us        46.69%     133.276us       5.553us            24  
+                                            aten::copy_         6.71%      96.363us        44.37%     637.354us      35.409us     111.072us        38.91%     129.472us       7.193us            18  
+                                            aten::clone         1.41%      20.282us        39.28%     564.182us      94.030us       0.000us         0.00%      72.352us      12.059us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.120us        20.01%      57.120us       4.760us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.952us        18.90%      53.952us       8.992us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.087us        14.39%      41.087us       3.424us            12  
+                                              aten::sub         3.01%      43.193us         4.71%      67.582us      11.264us      20.671us         7.24%      20.671us       3.445us             6  
+                                              aten::add         2.32%      33.393us         3.83%      55.073us       9.179us      20.416us         7.15%      20.416us       3.403us             6  
+                                Activity Buffer Request        16.10%     231.245us        16.10%     231.245us     231.245us      18.400us         6.45%      18.400us      18.400us             1  
+                                    aten::empty_strided         2.13%      30.620us         2.13%      30.620us       5.103us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.40%     249.916us        17.40%     249.916us      41.653us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.43%      63.582us         5.61%      80.624us       3.359us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.19%      17.042us         1.19%      17.042us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.71%     211.350us        14.71%     211.350us       4.403us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.701us         0.40%       5.701us       5.701us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.393ms
-Self CUDA time total: 285.567us
+Self CPU time total: 1.436ms
+Self CUDA time total: 285.435us
 
 
 
@@ -4346,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.018ms       178.84%       1.018ms       1.018ms             1  
-                                            torch_eager        20.93%     294.497us        99.60%       1.402ms       1.402ms       0.000us         0.00%     592.727us     592.727us             1  
-                                            aten::copy_         7.21%     101.432us        37.46%     527.132us      29.285us     273.662us        48.09%     297.374us      16.521us            18  
-                                              aten::mul        12.77%     179.663us        21.96%     309.058us      12.877us     229.816us        40.39%     229.816us       9.576us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     229.816us        40.39%     229.816us       9.576us            24  
-                                            aten::clone         1.44%      20.252us        30.89%     434.772us      72.462us       0.000us         0.00%     206.559us      34.426us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.847us        32.13%     182.847us      30.474us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.815us        15.96%      90.815us       7.568us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.537us        11.52%      65.537us       5.461us            12  
-                                              aten::sub         2.76%      38.901us         5.06%      71.272us      11.879us      32.865us         5.78%      32.865us       5.477us             6  
-                                              aten::add         2.62%      36.830us         4.51%      63.532us      10.589us      32.672us         5.74%      32.672us       5.445us             6  
-                                Activity Buffer Request        11.90%     167.504us        11.90%     167.504us     167.504us      23.712us         4.17%      23.712us      23.712us             1  
-                                    aten::empty_strided         2.15%      30.291us         2.15%      30.291us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.10%     184.385us        13.10%     184.385us      30.731us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.77%      67.152us         6.08%      85.571us       3.565us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      18.419us         1.31%      18.419us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.64%     262.279us        18.64%     262.279us       5.464us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.670us         0.40%       5.670us       5.670us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.081us       160.07%     912.081us     912.081us             1  
+                                            torch_eager        19.62%     281.157us        99.59%       1.427ms       1.427ms       0.000us         0.00%     593.529us     593.529us             1  
+                                            aten::copy_         6.77%      97.080us        44.43%     636.741us      35.375us     273.659us        48.03%     297.370us      16.521us            18  
+                                              aten::mul        10.89%     156.021us        18.62%     266.885us      11.120us     230.237us        40.41%     230.237us       9.593us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     230.237us        40.41%     230.237us       9.593us            24  
+                                            aten::clone         1.32%      18.940us        39.02%     559.152us      93.192us       0.000us         0.00%     206.843us      34.474us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.132us        32.14%     183.132us      30.522us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.527us        15.89%      90.527us       7.544us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.922us        11.57%      65.922us       5.494us            12  
+                                              aten::sub         2.63%      37.723us         4.26%      61.043us      10.174us      33.089us         5.81%      33.089us       5.515us             6  
+                                              aten::add         2.15%      30.871us         3.64%      52.150us       8.692us      32.833us         5.76%      32.833us       5.472us             6  
+                                Activity Buffer Request        16.39%     234.945us        16.39%     234.945us     234.945us      23.711us         4.16%      23.711us      23.711us             1  
+                                    aten::empty_strided         2.06%      29.522us         2.06%      29.522us       4.920us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.05%     244.375us        17.05%     244.375us      40.729us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.41%      63.274us         5.65%      80.913us       3.371us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.23%      17.639us         1.23%      17.639us       0.735us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.06%     215.804us        15.06%     215.804us       4.496us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.810us         0.41%       5.810us       5.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.407ms
-Self CUDA time total: 569.015us
+Self CPU time total: 1.433ms
+Self CUDA time total: 569.818us
 
 
 
@@ -4376,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.000ms      1080.76%       1.000ms       1.000ms             1  
-                                            torch_eager         9.93%     294.202us        99.82%       2.956ms       2.956ms       0.000us         0.00%      93.659us      93.659us             1  
-                                              aten::mul         5.79%     171.584us        10.10%     299.275us      12.470us      49.565us        53.56%      49.565us       2.065us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.565us        53.56%      49.565us       2.065us            24  
-                                            aten::copy_         3.59%     106.321us        70.88%       2.099ms     116.627us      29.406us        31.78%      30.526us       1.696us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.623us        24.45%      22.623us       1.885us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.568us        14.66%      13.568us       1.131us            12  
-                                            aten::clone         0.71%      20.970us        67.84%       2.009ms     334.883us       0.000us         0.00%       7.903us       1.317us             6  
-                                              aten::sub         1.35%      39.881us         2.33%      69.031us      11.505us       6.784us         7.33%       6.784us       1.131us             6  
-                                              aten::add         1.15%      34.189us         1.99%      58.861us       9.810us       6.784us         7.33%       6.784us       1.131us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us         7.33%       6.783us       1.130us             6  
-                                Activity Buffer Request        58.33%       1.728ms        58.33%       1.728ms       1.728ms       1.120us         1.21%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.04%      30.880us         1.04%      30.880us       5.147us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.53%     193.506us         6.53%     193.506us      32.251us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.25%      66.634us         2.83%      83.844us       3.493us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.58%      17.210us         0.58%      17.210us       0.717us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.56%     253.394us         8.56%     253.394us       5.279us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.350us         0.18%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     982.303us      1061.08%     982.303us     982.303us             1  
+                                            torch_eager         9.91%     295.071us        99.80%       2.971ms       2.971ms       0.000us         0.00%      93.696us      93.696us             1  
+                                              aten::mul         5.34%     158.961us         9.11%     271.063us      11.294us      49.631us        53.61%      49.631us       2.068us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.631us        53.61%      49.631us       2.068us            24  
+                                            aten::copy_         3.28%      97.679us        70.68%       2.104ms     116.897us      29.410us        31.77%      30.530us       1.696us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.593us        24.40%      22.593us       1.883us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.535us        14.62%      13.535us       1.128us            12  
+                                            aten::clone         0.75%      22.189us        68.29%       2.033ms     338.797us       0.000us         0.00%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.817us         7.36%       6.817us       1.136us             6  
+                                              aten::sub         1.25%      37.104us         2.05%      60.974us      10.162us       6.784us         7.33%       6.784us       1.131us             6  
+                                              aten::add         1.06%      31.689us         1.78%      52.921us       8.820us       6.751us         7.29%       6.751us       1.125us             6  
+                                Activity Buffer Request        57.10%       1.700ms        57.10%       1.700ms       1.700ms       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.03%      30.660us         1.03%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.33%     247.996us         8.33%     247.996us      41.333us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.24%      66.630us         4.50%     133.983us       5.583us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         2.26%      67.353us         2.26%      67.353us       2.806us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.25%     215.936us         7.25%     215.936us       4.499us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.820us         0.20%       5.820us       5.820us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.962ms
-Self CUDA time total: 92.539us
+Self CPU time total: 2.977ms
+Self CUDA time total: 92.576us
 
 
 
@@ -4406,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.005ms      1046.18%       1.005ms       1.005ms             1  
-                                            torch_eager        20.70%     290.067us        99.57%       1.395ms       1.395ms       0.000us         0.00%      97.374us      97.374us             1  
-                                              aten::mul        12.93%     181.162us        22.03%     308.587us      12.858us      51.138us        53.23%      51.138us       2.131us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.138us        53.23%      51.138us       2.131us            24  
-                                            aten::copy_         7.20%     100.934us        37.57%     526.405us      29.245us      30.750us        32.01%      32.061us       1.781us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.975us        23.92%      22.975us       1.915us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.175us        14.76%      14.175us       1.181us            12  
-                                            aten::clone         1.53%      21.448us        31.30%     438.559us      73.093us       0.000us         0.00%       9.086us       1.514us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         8.09%       7.775us       1.296us             6  
-                                              aten::sub         2.81%      39.410us         4.84%      67.761us      11.294us       7.103us         7.39%       7.103us       1.184us             6  
-                                              aten::add         2.52%      35.299us         4.51%      63.202us      10.534us       7.072us         7.36%       7.072us       1.179us             6  
-                                Activity Buffer Request        11.90%     166.735us        11.90%     166.735us     166.735us       1.311us         1.36%       1.311us       1.311us             1  
-                                    aten::empty_strided         2.22%      31.071us         2.22%      31.071us       5.178us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.33%     186.813us        13.33%     186.813us      31.136us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.90%      68.622us         6.17%      86.384us       3.599us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.27%      17.762us         1.27%      17.762us       0.740us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.24%     255.602us        18.24%     255.602us       5.325us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.43%       6.050us         0.43%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     884.987us       920.94%     884.987us     884.987us             1  
+                                            torch_eager        19.85%     274.487us        99.61%       1.377ms       1.377ms       0.000us         0.00%      97.408us      97.408us             1  
+                                              aten::mul        10.71%     148.094us        18.37%     254.035us      10.585us      51.231us        53.31%      51.231us       2.135us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.231us        53.31%      51.231us       2.135us            24  
+                                            aten::copy_         6.85%      94.742us        43.76%     605.074us      33.615us      30.720us        31.97%      32.032us       1.780us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.881us        23.81%      22.881us       1.907us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.145us        14.72%      14.145us       1.179us            12  
+                                            aten::clone         1.40%      19.328us        38.56%     533.201us      88.867us       0.000us         0.00%       9.151us       1.525us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.839us         8.16%       7.839us       1.306us             6  
+                                              aten::sub         2.67%      36.902us         4.36%      60.321us      10.054us       7.073us         7.36%       7.073us       1.179us             6  
+                                              aten::add         2.24%      30.942us         3.77%      52.122us       8.687us       7.072us         7.36%       7.072us       1.179us             6  
+                                Activity Buffer Request        15.48%     213.995us        15.48%     213.995us     213.995us       1.312us         1.37%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.17%      30.062us         2.17%      30.062us       5.010us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.28%     238.866us        17.28%     238.866us      39.811us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.64%      64.092us         5.92%      81.901us       3.413us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      17.809us         1.29%      17.809us       0.742us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.04%     208.011us        15.04%     208.011us       4.334us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.351us         0.39%       5.351us       5.351us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.401ms
-Self CUDA time total: 96.063us
+Self CPU time total: 1.383ms
+Self CUDA time total: 96.096us
 
 
 
@@ -4436,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.008ms       966.66%       1.008ms       1.008ms             1  
-                                            torch_eager        19.87%     287.726us        99.59%       1.442ms       1.442ms       0.000us         0.00%     105.564us     105.564us             1  
-                                              aten::mul        12.48%     180.745us        21.62%     312.998us      13.042us      55.327us        53.07%      55.327us       2.305us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.327us        53.07%      55.327us       2.305us            24  
-                                            aten::copy_         7.05%     102.025us        39.57%     572.964us      31.831us      32.416us        31.09%      33.727us       1.874us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.737us        23.73%      24.737us       2.061us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.510us        15.84%      16.510us       1.376us            12  
-                                            aten::clone         1.40%      20.271us        33.26%     481.521us      80.254us       0.000us         0.00%       8.990us       1.498us             6  
-                                              aten::sub         2.64%      38.220us         4.70%      68.062us      11.344us       8.351us         8.01%       8.351us       1.392us             6  
-                                              aten::add         2.49%      36.083us         4.34%      62.773us      10.462us       8.159us         7.83%       8.159us       1.360us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us         7.37%       7.679us       1.280us             6  
-                                Activity Buffer Request        13.36%     193.394us        13.36%     193.394us     193.394us       1.311us         1.26%       1.311us       1.311us             1  
-                                    aten::empty_strided         2.11%      30.520us         2.11%      30.520us       5.087us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.10%     204.105us        14.10%     204.105us      34.017us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.72%      68.322us         5.99%      86.681us       3.612us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.27%      18.359us         1.27%      18.359us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.11%     262.225us        18.11%     262.225us       5.463us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.880us         0.41%       5.880us       5.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     884.827us       849.48%     884.827us     884.827us             1  
+                                            torch_eager        19.57%     271.768us        99.62%       1.383ms       1.383ms       0.000us         0.00%     105.505us     105.505us             1  
+                                              aten::mul        10.84%     150.541us        18.55%     257.594us      10.733us      55.295us        53.09%      55.295us       2.304us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.295us        53.09%      55.295us       2.304us            24  
+                                            aten::copy_         7.04%      97.750us        44.26%     614.622us      34.146us      32.417us        31.12%      33.761us       1.876us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.609us        23.63%      24.609us       2.051us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.449us        15.79%      16.449us       1.371us            12  
+                                            aten::clone         1.38%      19.214us        38.80%     538.794us      89.799us       0.000us         0.00%       9.152us       1.525us             6  
+                                              aten::sub         2.56%      35.499us         4.18%      58.050us       9.675us       8.289us         7.96%       8.289us       1.381us             6  
+                                              aten::add         2.19%      30.379us         3.71%      51.460us       8.577us       8.160us         7.83%       8.160us       1.360us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.50%       7.808us       1.301us             6  
+                                Activity Buffer Request        16.03%     222.545us        16.03%     222.545us     222.545us       1.344us         1.29%       1.344us       1.344us             1  
+                                    aten::empty_strided         2.12%      29.421us         2.12%      29.421us       4.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.95%     235.404us        16.95%     235.404us      39.234us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.56%      63.330us         5.86%      81.351us       3.390us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.30%      18.021us         1.30%      18.021us       0.751us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.09%     209.608us        15.09%     209.608us       4.367us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.250us         0.38%       5.250us       5.250us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.448ms
-Self CUDA time total: 104.253us
+Self CPU time total: 1.389ms
+Self CUDA time total: 104.161us
 
 
 
@@ -4466,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms       813.90%       1.006ms       1.006ms             1  
-                                            torch_eager         9.94%     295.168us        99.80%       2.964ms       2.964ms       0.000us         0.00%     125.309us     125.309us             1  
-                                              aten::mul         5.76%     171.113us        10.09%     299.797us      12.492us      65.183us        52.76%      65.183us       2.716us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.183us        52.76%      65.183us       2.716us            24  
-                                            aten::copy_         3.36%      99.933us        70.61%       2.097ms     116.499us      39.102us        31.65%      40.862us       2.270us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.702us        23.23%      28.702us       2.392us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.59%      19.264us       1.605us            12  
-                                            aten::clone         0.72%      21.400us        67.60%       2.008ms     334.638us       0.000us         0.00%      12.160us       2.027us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us         8.42%      10.400us       1.733us             6  
-                                              aten::sub         1.36%      40.342us         2.40%      71.402us      11.900us       9.664us         7.82%       9.664us       1.611us             6  
-                                              aten::add         1.19%      35.292us         2.11%      62.642us      10.440us       9.600us         7.77%       9.600us       1.600us             6  
-                                Activity Buffer Request        58.22%       1.729ms        58.22%       1.729ms       1.729ms       1.760us         1.42%       1.760us       1.760us             1  
-                                    aten::empty_strided         1.03%      30.470us         1.03%      30.470us       5.078us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.55%     194.565us         6.55%     194.565us      32.427us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.28%      67.682us         2.90%      86.242us       3.593us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      18.560us         0.62%      18.560us       0.773us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.77%     260.395us         8.77%     260.395us       5.425us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.880us         0.20%       5.880us       5.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     919.709us       742.30%     919.709us     919.709us             1  
+                                            torch_eager         9.60%     287.984us        99.81%       2.995ms       2.995ms       0.000us         0.00%     125.756us     125.756us             1  
+                                              aten::mul         5.08%     152.565us         8.72%     261.687us      10.904us      65.119us        52.56%      65.119us       2.713us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.119us        52.56%      65.119us       2.713us            24  
+                                            aten::copy_         3.44%     103.340us        73.23%       2.197ms     122.068us      39.390us        31.79%      41.246us       2.291us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.863us        23.30%      28.863us       2.405us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.391us        15.65%      19.391us       1.616us            12  
+                                            aten::clone         0.71%      21.441us        70.60%       2.118ms     353.061us       0.000us         0.00%      12.383us       2.064us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us         8.50%      10.527us       1.754us             6  
+                                              aten::add         1.05%      31.401us         1.75%      52.512us       8.752us       9.728us         7.85%       9.728us       1.621us             6  
+                                              aten::sub         1.23%      37.000us         2.00%      60.082us      10.014us       9.663us         7.80%       9.663us       1.611us             6  
+                                Activity Buffer Request        59.85%       1.796ms        59.85%       1.796ms       1.796ms       1.856us         1.50%       1.856us       1.856us             1  
+                                    aten::empty_strided         1.02%      30.521us         1.02%      30.521us       5.087us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.96%     238.696us         7.96%     238.696us      39.783us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.18%      65.422us         2.78%      83.321us       3.472us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.60%      17.899us         0.60%      17.899us       0.746us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.09%     212.797us         7.09%     212.797us       4.433us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.701us         0.19%       5.701us       5.701us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.970ms
-Self CUDA time total: 123.549us
+Self CPU time total: 3.000ms
+Self CUDA time total: 123.900us
 
 
 
@@ -4496,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     983.089us       943.25%     983.089us     983.089us             1  
-                                            torch_eager        19.98%     288.033us        99.62%       1.436ms       1.436ms       0.000us         0.00%     105.536us     105.536us             1  
-                                              aten::mul        11.74%     169.216us        20.40%     294.119us      12.255us      55.424us        53.18%      55.424us       2.309us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.424us        53.18%      55.424us       2.309us            24  
-                                            aten::copy_         7.16%     103.226us        41.05%     591.856us      32.881us      32.352us        31.04%      33.664us       1.870us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.64%      24.640us       2.053us            12  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     877.937us       842.38%     877.937us     877.937us             1  
+                                            torch_eager        19.95%     270.846us        99.61%       1.353ms       1.353ms       0.000us         0.00%     105.532us     105.532us             1  
+                                              aten::mul        11.07%     150.304us        18.76%     254.767us      10.615us      55.358us        53.12%      55.358us       2.307us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.358us        53.12%      55.358us       2.307us            24  
+                                            aten::copy_         7.05%      95.673us        43.37%     588.992us      32.722us      32.415us        31.10%      33.726us       1.874us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.672us        23.67%      24.672us       2.056us            12  
 void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.448us        15.78%      16.448us       1.371us            12  
-                                            aten::clone         1.52%      21.981us        35.14%     506.672us      84.445us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::sub         2.73%      39.361us         4.47%      64.512us      10.752us       8.289us         7.95%       8.289us       1.381us             6  
-                                              aten::add         2.40%      34.591us         4.34%      62.531us      10.422us       8.159us         7.83%       8.159us       1.360us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.40%       7.712us       1.285us             6  
-                                Activity Buffer Request        15.86%     228.735us        15.86%     228.735us     228.735us       1.312us         1.26%       1.312us       1.312us             1  
-                                    aten::empty_strided         2.19%      31.580us         2.19%      31.580us       5.263us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.29%     191.574us        13.29%     191.574us      31.929us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.47%      64.452us         5.67%      81.763us       3.407us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.20%      17.311us         1.20%      17.311us       0.721us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.08%     246.315us        17.08%     246.315us       5.132us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.431us         0.38%       5.431us       5.431us       0.000us         0.00%       0.000us       0.000us             1  
+                                            aten::clone         1.39%      18.922us        38.11%     517.523us      86.254us       0.000us         0.00%       9.054us       1.509us             6  
+                                              aten::sub         2.65%      35.970us         4.36%      59.161us       9.860us       8.288us         7.95%       8.288us       1.381us             6  
+                                              aten::add         2.30%      31.190us         3.85%      52.221us       8.703us       8.160us         7.83%       8.160us       1.360us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         7.43%       7.743us       1.290us             6  
+                                Activity Buffer Request        14.68%     199.344us        14.68%     199.344us     199.344us       1.311us         1.26%       1.311us       1.311us             1  
+                                    aten::empty_strided         2.13%      28.940us         2.13%      28.940us       4.823us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.49%     237.454us        17.49%     237.454us      39.576us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.56%      61.882us         5.80%      78.761us       3.282us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.24%      16.879us         1.24%      16.879us       0.703us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.11%     205.206us        15.11%     205.206us       4.275us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.300us         0.39%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.442ms
-Self CUDA time total: 104.224us
+Self CPU time total: 1.358ms
+Self CUDA time total: 104.221us
 
 
 
@@ -4526,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.004ms       809.35%       1.004ms       1.004ms             1  
-                                            torch_eager        20.77%     293.259us        99.59%       1.406ms       1.406ms       0.000us         0.00%     125.785us     125.785us             1  
-                                              aten::mul        11.90%     168.115us        21.00%     296.496us      12.354us      65.470us        52.80%      65.470us       2.728us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.470us        52.80%      65.470us       2.728us            24  
-                                            aten::copy_         7.29%     102.893us        38.63%     545.534us      30.307us      39.326us        31.72%      41.117us       2.284us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.831us        23.25%      28.831us       2.403us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.198us        15.48%      19.198us       1.600us            12  
-                                            aten::clone         1.46%      20.620us        31.97%     451.491us      75.249us       0.000us         0.00%      12.286us       2.048us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.46%      10.495us       1.749us             6  
-                                              aten::add         2.69%      38.013us         4.51%      63.752us      10.625us       9.599us         7.74%       9.599us       1.600us             6  
-                                              aten::sub         3.27%      46.223us         5.18%      73.182us      12.197us       9.599us         7.74%       9.599us       1.600us             6  
-                                Activity Buffer Request        12.70%     179.315us        12.70%     179.315us     179.315us       1.791us         1.44%       1.791us       1.791us             1  
-                                    aten::empty_strided         2.12%      29.941us         2.12%      29.941us       4.990us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.35%     188.535us        13.35%     188.535us      31.423us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.72%      66.700us         5.92%      83.651us       3.485us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.20%      16.951us         1.20%      16.951us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.12%     255.870us        18.12%     255.870us       5.331us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.720us         0.41%       5.720us       5.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     922.549us       744.77%     922.549us     922.549us             1  
+                                            torch_eager        21.60%     303.651us        99.57%       1.400ms       1.400ms       0.000us         0.00%     125.727us     125.727us             1  
+                                              aten::mul        10.95%     153.925us        18.62%     261.836us      10.910us      65.152us        52.60%      65.152us       2.715us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.152us        52.60%      65.152us       2.715us            24  
+                                            aten::copy_         6.95%      97.683us        42.26%     594.153us      33.008us      39.455us        31.85%      41.311us       2.295us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.863us        23.30%      28.863us       2.405us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.55%      19.264us       1.605us            12  
+                                            aten::clone         1.38%      19.461us        36.95%     519.462us      86.577us       0.000us         0.00%      12.448us       2.075us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us         8.55%      10.592us       1.765us             6  
+                                              aten::add         2.18%      30.620us         3.65%      51.250us       8.542us       9.664us         7.80%       9.664us       1.611us             6  
+                                              aten::sub         2.57%      36.162us         4.29%      60.263us      10.044us       9.600us         7.75%       9.600us       1.600us             6  
+                                Activity Buffer Request        14.58%     204.944us        14.58%     204.944us     204.944us       1.856us         1.50%       1.856us       1.856us             1  
+                                    aten::empty_strided         2.07%      29.171us         2.07%      29.171us       4.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.64%     233.894us        16.64%     233.894us      38.982us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.45%      62.564us         5.70%      80.065us       3.336us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.24%      17.501us         1.24%      17.501us       0.729us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.96%     210.274us        14.96%     210.274us       4.381us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.43%       5.990us         0.43%       5.990us       5.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.412ms
-Self CUDA time total: 123.994us
+Self CPU time total: 1.406ms
+Self CUDA time total: 123.871us
 
 
 
@@ -4556,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.022ms       574.36%       1.022ms       1.022ms             1  
-                                            torch_eager         9.88%     300.789us        99.79%       3.040ms       3.040ms       0.000us         0.00%     180.741us     180.741us             1  
-                                              aten::mul         5.87%     178.699us        10.24%     311.746us      12.989us      95.331us        53.59%      95.331us       3.972us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.331us        53.59%      95.331us       3.972us            24  
-                                            aten::copy_         3.42%     104.263us        70.83%       2.158ms     119.861us      57.731us        32.45%      60.579us       3.366us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.803us        22.94%      40.803us       3.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.831us        13.96%      24.831us       2.069us            12  
-                                            aten::clone         0.72%      21.809us        67.85%       2.067ms     344.453us       0.000us         0.00%      19.776us       3.296us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.52%      16.928us       2.821us             6  
-                                              aten::add         1.17%      35.582us         2.06%      62.693us      10.449us      12.480us         7.02%      12.480us       2.080us             6  
-                                              aten::sub         1.34%      40.961us         2.23%      67.853us      11.309us      12.351us         6.94%      12.351us       2.059us             6  
-                                Activity Buffer Request        58.86%       1.793ms        58.86%       1.793ms       1.793ms       2.848us         1.60%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.00%      30.461us         1.00%      30.461us       5.077us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.14%     187.135us         6.14%     187.135us      31.189us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.22%      67.750us         2.85%      86.690us       3.612us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.62%      18.940us         0.62%      18.940us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.55%     260.511us         8.55%     260.511us       5.427us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       6.321us         0.21%       6.321us       6.321us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     897.539us       506.00%     897.539us     897.539us             1  
+                                            torch_eager         9.74%     279.839us        99.80%       2.869ms       2.869ms       0.000us         0.00%     180.226us     180.226us             1  
+                                              aten::mul         5.31%     152.543us         9.01%     258.914us      10.788us      94.688us        53.38%      94.688us       3.945us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.688us        53.38%      94.688us       3.945us            24  
+                                            aten::copy_         3.38%      97.055us        72.52%       2.084ms     115.804us      57.633us        32.49%      60.481us       3.360us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.641us        22.91%      40.641us       3.387us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.057us        14.13%      25.057us       2.088us            12  
+                                            aten::clone         0.77%      22.210us        70.07%       2.014ms     335.666us       0.000us         0.00%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.58%      16.992us       2.832us             6  
+                                              aten::add         1.09%      31.359us         1.84%      52.920us       8.820us      12.577us         7.09%      12.577us       2.096us             6  
+                                              aten::sub         1.28%      36.869us         2.08%      59.850us       9.975us      12.480us         7.04%      12.480us       2.080us             6  
+                                Activity Buffer Request        59.04%       1.697ms        59.04%       1.697ms       1.697ms       2.848us         1.61%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.03%      29.731us         1.03%      29.731us       4.955us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.09%     232.664us         8.09%     232.664us      38.777us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.21%      63.552us         2.81%      80.641us       3.360us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.59%      17.089us         0.59%      17.089us       0.712us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.26%     208.724us         7.26%     208.724us       4.348us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.620us         0.20%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.046ms
-Self CUDA time total: 177.893us
+Self CPU time total: 2.874ms
+Self CUDA time total: 177.378us
 
 
 
@@ -4586,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.032ms       346.83%       1.032ms       1.032ms             1  
-                                            torch_eager        10.94%     330.454us        99.81%       3.014ms       3.014ms       0.000us         0.00%     315.361us     315.361us             1  
-                                              aten::mul         5.75%     173.643us        10.12%     305.657us      12.736us     145.696us        48.95%     145.696us       6.071us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.696us        48.95%     145.696us       6.071us            24  
-                                            aten::copy_         3.49%     105.289us        69.61%       2.102ms     116.764us     110.818us        37.23%     128.546us       7.141us            18  
-                                            aten::clone         0.91%      27.529us        66.74%       2.015ms     335.845us       0.000us         0.00%      71.233us      11.872us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.313us        19.26%      57.313us       4.776us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.505us        17.98%      53.505us       8.918us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.119us        13.82%      41.119us       3.427us            12  
-                                              aten::sub         1.33%      40.040us         2.29%      68.991us      11.499us      20.607us         6.92%      20.607us       3.434us             6  
-                                              aten::add         1.18%      35.771us         2.04%      61.541us      10.257us      20.512us         6.89%      20.512us       3.419us             6  
-                                Activity Buffer Request        56.83%       1.716ms        56.83%       1.716ms       1.716ms      17.728us         5.96%      17.728us      17.728us             1  
-                                    aten::empty_strided         1.04%      31.352us         1.04%      31.352us       5.225us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.87%     207.436us         6.87%     207.436us      34.573us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.27%      68.592us         2.86%      86.271us       3.595us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      17.679us         0.59%      17.679us       0.737us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.61%     259.847us         8.61%     259.847us       5.413us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.690us         0.19%       5.690us       5.690us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     898.810us       301.67%     898.810us     898.810us             1  
+                                            torch_eager         9.55%     282.366us        99.82%       2.951ms       2.951ms       0.000us         0.00%     315.933us     315.933us             1  
+                                              aten::mul         5.12%     151.293us         8.70%     257.314us      10.721us     145.309us        48.77%     145.309us       6.055us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.309us        48.77%     145.309us       6.055us            24  
+                                            aten::copy_         3.22%      95.316us        73.31%       2.167ms     120.400us     111.488us        37.42%     129.472us       7.193us            18  
+                                            aten::clone         0.72%      21.328us        70.93%       2.097ms     349.496us       0.000us         0.00%      72.192us      12.032us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.280us        19.22%      57.280us       4.773us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.208us        18.19%      54.208us       9.035us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.152us        13.81%      41.152us       3.429us            12  
+                                              aten::sub         1.21%      35.903us         1.98%      58.432us       9.739us      20.607us         6.92%      20.607us       3.434us             6  
+                                              aten::add         1.05%      31.001us         1.75%      51.651us       8.608us      20.545us         6.90%      20.545us       3.424us             6  
+                                Activity Buffer Request        60.43%       1.786ms        60.43%       1.786ms       1.786ms      17.984us         6.04%      17.984us      17.984us             1  
+                                    aten::empty_strided         1.00%      29.601us         1.00%      29.601us       4.934us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.72%     228.244us         7.72%     228.244us      38.041us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.20%      64.933us         2.81%      83.072us       3.461us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.61%      18.139us         0.61%      18.139us       0.756us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.98%     206.421us         6.98%     206.421us       4.300us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.270us         0.18%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.019ms
-Self CUDA time total: 297.633us
+Self CPU time total: 2.956ms
+Self CUDA time total: 297.949us
 
 
 
@@ -4616,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     997.224us       560.81%     997.224us     997.224us             1  
-                                            torch_eager        20.41%     294.666us        99.60%       1.438ms       1.438ms       0.000us         0.00%     180.699us     180.699us             1  
-                                              aten::mul        11.81%     170.426us        20.82%     300.510us      12.521us      95.263us        53.57%      95.263us       3.969us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.263us        53.57%      95.263us       3.969us            24  
-                                            aten::copy_         6.98%     100.734us        40.22%     580.546us      32.253us      57.725us        32.46%      60.604us       3.367us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.895us        23.00%      40.895us       3.408us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        13.96%      24.832us       2.069us            12  
-                                            aten::clone         1.48%      21.402us        34.03%     491.202us      81.867us       0.000us         0.00%      19.709us       3.285us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.830us         9.46%      16.830us       2.805us             6  
-                                              aten::sub         2.72%      39.260us         4.81%      69.410us      11.568us      12.416us         6.98%      12.416us       2.069us             6  
-                                              aten::add         2.46%      35.491us         4.17%      60.132us      10.022us      12.416us         6.98%      12.416us       2.069us             6  
-                                Activity Buffer Request        14.76%     213.066us        14.76%     213.066us     213.066us       2.879us         1.62%       2.879us       2.879us             1  
-                                    aten::empty_strided         2.03%      29.329us         2.03%      29.329us       4.888us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.55%     195.534us        13.55%     195.534us      32.589us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.52%      65.271us         5.66%      81.651us       3.402us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.13%      16.380us         1.13%      16.380us       0.683us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.74%     256.087us        17.74%     256.087us       5.335us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.801us         0.40%       5.801us       5.801us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     892.604us       503.24%     892.604us     892.604us             1  
+                                            torch_eager        19.40%     275.276us        99.62%       1.413ms       1.413ms       0.000us         0.00%     180.252us     180.252us             1  
+                                              aten::mul        10.76%     152.603us        18.14%     257.417us      10.726us      94.815us        53.46%      94.815us       3.951us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.815us        53.46%      94.815us       3.951us            24  
+                                            aten::copy_         6.67%      94.615us        44.72%     634.467us      35.248us      57.598us        32.47%      60.478us       3.360us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.510us        22.84%      40.510us       3.376us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.959us        14.07%      24.959us       2.080us            12  
+                                            aten::clone         1.36%      19.279us        39.62%     562.091us      93.682us       0.000us         0.00%      19.968us       3.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.63%      17.088us       2.848us             6  
+                                              aten::add         2.18%      30.992us         4.25%      60.342us      10.057us      12.607us         7.11%      12.607us       2.101us             6  
+                                              aten::sub         2.49%      35.261us         4.05%      57.481us       9.580us      12.352us         6.96%      12.352us       2.059us             6  
+                                Activity Buffer Request        18.10%     256.816us        18.10%     256.816us     256.816us       2.880us         1.62%       2.880us       2.880us             1  
+                                    aten::empty_strided         2.07%      29.380us         2.07%      29.380us       4.897us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.91%     225.774us        15.91%     225.774us      37.629us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.38%      62.100us         5.62%      79.779us       3.324us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      17.679us         1.25%      17.679us       0.737us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.06%     213.646us        15.06%     213.646us       4.451us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.461us         0.38%       5.461us       5.461us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.443ms
-Self CUDA time total: 177.820us
+Self CPU time total: 1.419ms
+Self CUDA time total: 177.372us
 
 
 
@@ -4646,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.006ms       337.57%       1.006ms       1.006ms             1  
-                                            torch_eager        19.59%     286.301us        99.64%       1.456ms       1.456ms       0.000us         0.00%     316.065us     316.065us             1  
-                                              aten::mul        11.95%     174.684us        20.79%     303.846us      12.660us     145.439us        48.82%     145.439us       6.060us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.439us        48.82%     145.439us       6.060us            24  
-                                            aten::copy_         7.17%     104.791us        40.98%     598.785us      33.266us     111.649us        37.48%     129.826us       7.213us            18  
-                                            aten::clone         1.35%      19.700us        34.46%     503.492us      83.915us       0.000us         0.00%      72.450us      12.075us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.376us        19.26%      57.376us       4.781us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.273us        18.22%      54.273us       9.045us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.800us        13.70%      40.800us       3.400us            12  
-                                              aten::sub         2.64%      38.590us         4.63%      67.691us      11.282us      20.448us         6.86%      20.448us       3.408us             6  
-                                              aten::add         2.53%      37.009us         4.42%      64.522us      10.754us      20.352us         6.83%      20.352us       3.392us             6  
-                                Activity Buffer Request        15.77%     230.486us        15.77%     230.486us     230.486us      18.177us         6.10%      18.177us      18.177us             1  
-                                    aten::empty_strided         2.02%      29.450us         2.02%      29.450us       4.908us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.98%     189.676us        12.98%     189.676us      31.613us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.60%      67.290us         5.86%      85.691us       3.570us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.26%      18.401us         1.26%      18.401us       0.767us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        17.77%     259.608us        17.77%     259.608us       5.409us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       5.300us         0.36%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     993.373us       332.44%     993.373us     993.373us             1  
+                                            torch_eager        19.67%     287.571us        99.61%       1.456ms       1.456ms       0.000us         0.00%     317.118us     317.118us             1  
+                                              aten::mul        10.73%     156.833us        18.24%     266.665us      11.111us     145.536us        48.70%     145.536us       6.064us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.536us        48.70%     145.536us       6.064us            24  
+                                            aten::copy_         6.97%     101.849us        44.88%     655.991us      36.444us     112.414us        37.62%     130.718us       7.262us            18  
+                                            aten::clone         1.31%      19.190us        34.67%     506.790us      84.465us       0.000us         0.00%      73.472us      12.245us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.246us        19.16%      57.246us       4.770us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      55.168us        18.46%      55.168us       9.195us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.864us        13.68%      40.864us       3.405us            12  
+                                              aten::sub         2.57%      37.523us         4.19%      61.312us      10.219us      20.447us         6.84%      20.447us       3.408us             6  
+                                              aten::add         2.11%      30.781us         3.53%      51.651us       8.608us      20.417us         6.83%      20.417us       3.403us             6  
+                                Activity Buffer Request        13.85%     202.395us        13.85%     202.395us     202.395us      18.304us         6.13%      18.304us      18.304us             1  
+                                    aten::empty_strided         2.07%      30.250us         2.07%      30.250us       5.042us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.36%     224.584us        15.36%     224.584us      37.431us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.45%      65.071us         5.71%      83.402us       3.475us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      18.331us         1.25%      18.331us       0.764us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        19.27%     281.654us        19.27%     281.654us       5.868us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.741us         0.39%       5.741us       5.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.461ms
-Self CUDA time total: 297.888us
+Self CPU time total: 1.462ms
+Self CUDA time total: 298.814us
 
 
 
@@ -4676,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.028ms       173.98%       1.028ms       1.028ms             1  
-                                            torch_eager        20.54%     307.941us        99.60%       1.493ms       1.493ms       0.000us         0.00%     614.720us     614.720us             1  
-                                            aten::copy_         6.96%     104.343us        39.80%     596.653us      33.147us     277.825us        47.01%     301.537us      16.752us            18  
-                                              aten::mul        11.79%     176.823us        20.94%     313.949us      13.081us     247.103us        41.81%     247.103us      10.296us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     247.103us        41.81%     247.103us      10.296us            24  
-                                            aten::clone         1.37%      20.599us        33.38%     500.402us      83.400us       0.000us         0.00%     210.816us      35.136us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     187.104us        31.66%     187.104us      31.184us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.721us        15.35%      90.721us       7.560us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.080us        11.18%      66.080us       5.507us            12  
-                                              aten::add         2.50%      37.532us         4.42%      66.263us      11.044us      33.056us         5.59%      33.056us       5.509us             6  
-                                              aten::sub         2.78%      41.622us         4.80%      72.031us      12.005us      33.024us         5.59%      33.024us       5.504us             6  
-                                Activity Buffer Request        15.35%     230.085us        15.35%     230.085us     230.085us      23.712us         4.01%      23.712us      23.712us             1  
-                                    aten::empty_strided         1.95%      29.191us         1.95%      29.191us       4.865us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.52%     187.674us        12.52%     187.674us      31.279us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.56%      68.431us         5.78%      86.660us       3.611us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.22%      18.229us         1.22%      18.229us       0.760us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        18.06%     270.817us        18.06%     270.817us       5.642us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       6.020us         0.40%       6.020us       6.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     897.247us       151.96%     897.247us     897.247us             1  
+                                            torch_eager        20.14%     274.634us        99.62%       1.358ms       1.358ms       0.000us         0.00%     614.305us     614.305us             1  
+                                            aten::copy_         6.96%      94.863us        42.61%     580.932us      32.274us     278.975us        47.25%     302.847us      16.825us            18  
+                                              aten::mul        11.17%     152.301us        19.13%     260.794us      10.866us     245.758us        41.62%     245.758us      10.240us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     245.758us        41.62%     245.758us      10.240us            24  
+                                            aten::clone         1.47%      20.049us        37.19%     507.071us      84.512us       0.000us         0.00%     212.225us      35.371us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     188.353us        31.90%     188.353us      31.392us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.622us        15.35%      90.622us       7.552us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.700us        11.13%      65.700us       5.475us            12  
+                                              aten::add         2.26%      30.880us         3.85%      52.533us       8.756us      32.930us         5.58%      32.930us       5.488us             6  
+                                              aten::sub         2.73%      37.222us         4.45%      60.643us      10.107us      32.770us         5.55%      32.770us       5.462us             6  
+                                Activity Buffer Request        15.15%     206.524us        15.15%     206.524us     206.524us      23.872us         4.04%      23.872us      23.872us             1  
+                                    aten::empty_strided         2.13%      29.022us         2.13%      29.022us       4.837us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.16%     220.325us        16.16%     220.325us      36.721us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.58%      62.481us         5.83%      79.533us       3.314us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      17.052us         1.25%      17.052us       0.711us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.61%     212.787us        15.61%     212.787us       4.433us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.230us         0.38%       5.230us       5.230us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.499ms
-Self CUDA time total: 591.008us
+Self CPU time total: 1.363ms
+Self CUDA time total: 590.433us
 
 
 
@@ -4706,59 +4706,59 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        13.10%     290.757us        64.42%       1.430ms       1.430ms       0.000us         0.00%       1.858ms       1.858ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.829ms       102.08%       1.829ms       1.829ms             1  
-                                            aten::copy_         4.50%      99.996us        26.00%     577.207us      32.067us     806.683us        45.02%     872.922us      48.496us            18  
-                                              aten::mul         7.46%     165.620us        13.35%     296.315us      12.346us     837.369us        46.73%     837.369us      34.890us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     837.369us        46.73%     837.369us      34.890us            24  
-                                            aten::clone         0.93%      20.729us        21.91%     486.420us      81.070us       0.000us         0.00%     620.507us     103.418us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     554.268us        30.93%     554.268us      92.378us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     252.415us        14.09%     252.415us      21.035us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.935us         8.26%     147.935us      12.328us            12  
-                                              aten::sub         1.77%      39.361us         2.98%      66.203us      11.034us      89.536us         5.00%      89.536us      14.923us             6  
-                                Activity Buffer Request         8.80%     195.395us         8.80%     195.395us     195.395us      66.239us         3.70%      66.239us      66.239us             1  
-                                              aten::add         1.55%      34.513us         2.81%      62.423us      10.404us      58.399us         3.26%      58.399us       9.733us             6  
-                                    aten::empty_strided         1.34%      29.799us         1.34%      29.799us       4.967us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.39%     208.344us         9.39%     208.344us      34.724us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.12%      69.193us         3.90%      86.553us       3.606us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      17.360us         0.78%      17.360us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        11.66%     258.919us        11.66%     258.919us       5.394us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        35.58%     789.949us        35.58%     789.949us     789.949us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager        12.28%     274.098us        60.28%       1.346ms       1.346ms       0.000us         0.00%       1.856ms       1.856ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.827ms       102.10%       1.827ms       1.827ms             1  
+                                            aten::copy_         4.28%      95.634us        25.22%     562.931us      31.274us     804.864us        44.98%     871.488us      48.416us            18  
+                                              aten::mul         6.88%     153.564us        11.78%     262.937us      10.956us     837.983us        46.83%     837.983us      34.916us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     837.983us        46.83%     837.983us      34.916us            24  
+                                            aten::clone         0.88%      19.740us        22.00%     491.160us      81.860us       0.000us         0.00%     619.552us     103.259us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     552.928us        30.90%     552.928us      92.155us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.936us        14.08%     251.936us      20.995us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     146.496us         8.19%     146.496us      12.208us            12  
+                                              aten::sub         1.66%      37.140us         2.70%      60.281us      10.047us      88.032us         4.92%      88.032us      14.672us             6  
+                                Activity Buffer Request         8.33%     186.034us         8.33%     186.034us     186.034us      66.624us         3.72%      66.624us      66.624us             1  
+                                              aten::add         1.41%      31.471us         2.39%      53.240us       8.873us      58.464us         3.27%      58.464us       9.744us             6  
+                                    aten::empty_strided         1.36%      30.441us         1.36%      30.441us       5.074us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.01%     223.534us        10.01%     223.534us      37.256us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.89%      64.468us         3.67%      81.941us       3.414us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      17.473us         0.78%      17.473us       0.728us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.50%     212.012us         9.50%     212.012us       4.417us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        39.72%     886.609us        39.72%     886.609us     886.609us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.220ms
-Self CUDA time total: 1.792ms
+Self CPU time total: 2.232ms
+Self CUDA time total: 1.789ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_S128_H32_D128_R64     0.27  True
-torch_eager              cuda_B1_S128_H32_D64_R32     0.25  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.25  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.19  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.25  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.25  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.24  True
-torch_eager              cuda_B1_S2048_H8_D64_R32     0.25  True
-torch_eager              cuda_B1_S512_H32_D128_R64     0.24  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.24  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.24  True
-torch_eager              cuda_B1_S512_H8_D64_R32     0.25  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.24  True
-torch_eager              cuda_B2_S128_H32_D64_R32     0.25  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.25  True
-torch_eager              cuda_B2_S128_H8_D64_R32     0.24  True
-torch_eager              cuda_B2_S2048_H32_D128_R64     0.65  True
-torch_eager              cuda_B2_S2048_H32_D64_R32     0.25  True
-torch_eager              cuda_B2_S2048_H8_D128_R64     0.25  True
-torch_eager              cuda_B2_S2048_H8_D64_R32     0.24  True
-torch_eager              cuda_B2_S512_H32_D128_R64     0.25  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.24  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.24  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.24  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.21  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
+torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
 
▶ UV Install Logs
diff --git a/rotary/index.html b/rotary/index.html index 34ae4d24f7821cde5b1c6e19b0c12afacbbfa2f0..997349ef6ba0f98a5afdbb285dbd667023c071e8 100644 --- a/rotary/index.html +++ b/rotary/index.html @@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35 + Darwin arm64 | macOS-15.7.2-arm64-arm-64bit
diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg index 98c53495ef8811148d17a31fad132efc88ecce65..e9b78a4025027c1aa711ddc44bf104d3d7442727 100644 --- a/rotary/results/artifacts/combine/latency.svg +++ b/rotary/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b50b39073ca470536083df3bd46c41d2c32d623cb6c07d81e1425542a6d29446 -size 37864 +oid sha256:5520e194a23ff7d6256419fb71b63140879250c745256b97f5863e14fe78a38b +size 37850 diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html index 35b315157eac057055edd241aff84369bf3fa60b..33c934033048e87a115e4174b35f10fea665008d 100644 --- a/rotary/results/combined_results.html +++ b/rotary/results/combined_results.html @@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content { - 2025-12-19T19:55:53.341578 + 2025-12-19T23:02:58.818918 image/svg+xml @@ -4233,109 +4233,109 @@ body[data-tool="eraser"] .main-content { - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 - + - + - 0.6 + 0.6 - + - + - 0.7 + 0.7 - + - + - 0.8 + 0.8 @@ -4343,67 +4343,67 @@ body[data-tool="eraser"] .main-content { - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + @@ -4461,7 +4461,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.98s +Cell: combine | 4.50s | Raw @@ -4551,7 +4551,7 @@ impl wl p50(ms) ok hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True -hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True +hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True @@ -4572,30 +4572,30 @@ hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True -torch_eager cuda_B1_S128_H32_D128_R64 0.27 True -torch_eager cuda_B1_S128_H32_D64_R32 0.25 True -torch_eager cuda_B1_S128_H8_D128_R64 0.25 True -torch_eager cuda_B1_S128_H8_D64_R32 0.19 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.25 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.25 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.24 True -torch_eager cuda_B1_S2048_H8_D64_R32 0.25 True -torch_eager cuda_B1_S512_H32_D128_R64 0.24 True -torch_eager cuda_B1_S512_H32_D64_R32 0.24 True -torch_eager cuda_B1_S512_H8_D128_R64 0.24 True -torch_eager cuda_B1_S512_H8_D64_R32 0.25 True -torch_eager cuda_B2_S128_H32_D128_R64 0.24 True -torch_eager cuda_B2_S128_H32_D64_R32 0.25 True -torch_eager cuda_B2_S128_H8_D128_R64 0.25 True -torch_eager cuda_B2_S128_H8_D64_R32 0.24 True -torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True -torch_eager cuda_B2_S2048_H32_D64_R32 0.25 True -torch_eager cuda_B2_S2048_H8_D128_R64 0.25 True -torch_eager cuda_B2_S2048_H8_D64_R32 0.24 True -torch_eager cuda_B2_S512_H32_D128_R64 0.25 True -torch_eager cuda_B2_S512_H32_D64_R32 0.24 True -torch_eager cuda_B2_S512_H8_D128_R64 0.24 True -torch_eager cuda_B2_S512_H8_D64_R32 0.24 True +torch_eager cuda_B1_S128_H32_D128_R64 0.22 True +torch_eager cuda_B1_S128_H32_D64_R32 0.22 True +torch_eager cuda_B1_S128_H8_D128_R64 0.22 True +torch_eager cuda_B1_S128_H8_D64_R32 0.17 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True +torch_eager cuda_B1_S512_H32_D128_R64 0.22 True +torch_eager cuda_B1_S512_H32_D64_R32 0.21 True +torch_eager cuda_B1_S512_H8_D128_R64 0.21 True +torch_eager cuda_B1_S512_H8_D64_R32 0.22 True +torch_eager cuda_B2_S128_H32_D128_R64 0.21 True +torch_eager cuda_B2_S128_H32_D64_R32 0.21 True +torch_eager cuda_B2_S128_H8_D128_R64 0.22 True +torch_eager cuda_B2_S128_H8_D64_R32 0.21 True +torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True +torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True +torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True +torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True +torch_eager cuda_B2_S512_H32_D128_R64 0.22 True +torch_eager cuda_B2_S512_H32_D64_R32 0.22 True +torch_eager cuda_B2_S512_H8_D128_R64 0.21 True +torch_eager cuda_B2_S512_H8_D64_R32 0.21 True GENERATING COMBINED VISUALIZATION @@ -4615,7 +4615,7 @@ Implementations included:
▶ UV Install Logs
@@ -4628,7 +4628,7 @@ Installed 37 packages in 314ms - 2025-12-19T19:55:53.341578 + 2025-12-19T23:02:58.818918 image/svg+xml @@ -4972,109 +4972,109 @@ Installed 37 packages in 314ms - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 - + - + - 0.6 + 0.6 - + - + - 0.7 + 0.7 - + - + - 0.8 + 0.8 @@ -5082,67 +5082,67 @@ Installed 37 packages in 314ms - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + +