+
+
+
+
+
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_mem_eff(q, k, v):
+    # Attention forward pass via PyTorch SDPA, pinned to the memory-efficient
+    # (CUTLASS fmha) backend so the benchmark measures exactly one kernel family.
+    #
+    # q, k, v appear to arrive as (batch, seq, heads, dim) — the transpose(1, 2)
+    # moves them into SDPA's expected (batch, heads, seq, dim) layout; TODO
+    # confirm the input layout against the harness.
+    #
+    # NOTE(review): the pre-SDPA .contiguous() copies are visible in the profile
+    # traces below (aten::contiguous / aten::copy_ at ~10-11% of CUDA time).
+    # SDPA accepts strided inputs, so these copies may be deliberate layout
+    # normalization rather than a requirement — verify before removing.
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(
+ torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION
+ ):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    # Transpose back to the caller's layout; .contiguous() materializes the copy.
+ return o.transpose(1, 2).contiguous()
+
+
+# Register and execute this implementation through the shared attention
+# benchmark harness (kernels-benchmark-tools). impl_tags label the run for
+# downstream aggregation: torch-sdpa family, EFFICIENT backend, no compile.
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="torch_mem_eff",
+ impl_tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"},
+ impl_func=torch_mem_eff,
+)
+
+
+
+Running attention benchmark on cuda with 6 workloads. + +====================================================================== +PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + torch_mem_eff 4.95% 352.351us 32.76% 2.334ms 2.334ms 0.000us 0.00% 5.540ms 5.540ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.61% 5.523ms 5.523ms 1 + aten::scaled_dot_product_attention 0.42% 30.002us 2.65% 188.407us 62.802us 0.000us 0.00% 4.866ms 1.622ms 3 + aten::_scaled_dot_product_efficient_attention 0.34% 24.112us 2.22% 158.405us 52.802us 0.000us 0.00% 4.866ms 1.622ms 3 + aten::_efficient_attention_forward 0.50% 35.512us 1.50% 106.553us 35.518us 4.866ms 88.65% 4.866ms 1.622ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.866ms 88.65% 4.866ms 1.622ms 3 + aten::contiguous 0.17% 12.230us 24.19% 1.723ms 191.466us 0.000us 0.00% 673.885us 74.876us 9 + aten::clone 0.48% 34.032us 24.02% 1.711ms 190.107us 0.000us 0.00% 673.885us 74.876us 9 + aten::copy_ 1.04% 73.980us 22.51% 1.603ms 178.136us 623.037us 11.35% 673.885us 74.876us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 623.037us 11.35% 623.037us 69.226us 9 + Activity Buffer Request 20.23% 1.441ms 20.23% 1.441ms 1.441ms 50.848us 0.93% 50.848us 50.848us 1 + aten::transpose 1.03% 73.058us 1.37% 97.392us 4.058us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.34% 24.334us 0.34% 24.334us 1.014us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.28% 19.590us 1.03% 73.701us 8.189us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.26% 89.621us 1.26% 89.621us 4.268us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.58% 112.598us 1.58% 112.598us 9.383us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.04% 3.160us 0.04% 3.160us 1.053us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.12% 8.400us 0.12% 8.400us 2.800us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 67.24% 4.789ms 67.24% 4.789ms 4.789ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 7.123ms +Self CUDA time total: 5.489ms + + + +====================================================================== +PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + torch_mem_eff 3.15% 231.099us 27.84% 2.044ms 2.044ms 0.000us 0.00% 5.902ms 5.902ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.856ms 100.14% 
5.856ms 5.856ms 1 + aten::scaled_dot_product_attention 0.26% 19.041us 1.91% 140.484us 46.828us 0.000us 0.00% 5.210ms 1.737ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 18.340us 1.65% 121.443us 40.481us 0.000us 0.00% 5.210ms 1.737ms 3 + aten::_efficient_attention_forward 0.40% 29.263us 1.10% 80.783us 26.928us 5.210ms 89.09% 5.210ms 1.737ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.210ms 89.09% 5.210ms 1.737ms 3 + aten::contiguous 0.10% 7.239us 22.19% 1.629ms 181.023us 0.000us 0.00% 692.607us 76.956us 9 + aten::clone 0.29% 21.632us 22.09% 1.622ms 180.219us 0.000us 0.00% 692.607us 76.956us 9 + aten::copy_ 0.87% 63.554us 21.13% 1.551ms 172.359us 638.271us 10.91% 692.607us 76.956us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 638.271us 10.91% 638.271us 70.919us 9 + Activity Buffer Request 19.39% 1.423ms 19.39% 1.423ms 1.423ms 54.336us 0.93% 54.336us 54.336us 1 + aten::transpose 0.66% 48.509us 0.89% 65.581us 2.733us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.23% 17.072us 0.23% 17.072us 0.711us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.16% 11.700us 0.67% 49.102us 5.456us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.83% 61.232us 0.83% 61.232us 2.916us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.18% 86.372us 1.18% 86.372us 7.198us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.340us 0.03% 2.340us 0.780us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.05% 3.500us 0.05% 3.500us 1.167us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 72.16% 5.297ms 72.16% 5.297ms 5.297ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 7.341ms +Self CUDA time total: 5.848ms + + + 
+====================================================================== +PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + torch_mem_eff 2.94% 229.483us 29.69% 2.318ms 2.318ms 0.000us 0.00% 6.099ms 6.099ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.049ms 100.14% 6.049ms 6.049ms 1 + aten::scaled_dot_product_attention 0.23% 17.971us 1.79% 139.464us 46.488us 0.000us 0.00% 5.384ms 1.795ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 18.090us 1.56% 121.493us 40.498us 0.000us 0.00% 5.384ms 1.795ms 3 + aten::_efficient_attention_forward 0.36% 27.830us 1.04% 80.963us 26.988us 5.384ms 89.13% 5.384ms 1.795ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.384ms 89.13% 5.384ms 1.795ms 3 + aten::contiguous 0.09% 7.278us 24.41% 1.906ms 211.734us 0.000us 0.00% 714.652us 79.406us 9 + aten::clone 0.28% 21.781us 24.31% 1.898ms 210.925us 0.000us 0.00% 714.652us 79.406us 9 + aten::copy_ 0.80% 62.662us 23.36% 1.824ms 202.683us 656.540us 10.87% 714.652us 79.406us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 656.540us 10.87% 656.540us 72.949us 9 + Activity Buffer Request 21.74% 1.697ms 21.74% 1.697ms 1.697ms 58.112us 0.96% 58.112us 58.112us 1 + aten::transpose 0.63% 48.810us 0.84% 65.850us 2.744us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.22% 17.040us 0.22% 17.040us 0.710us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 11.161us 0.67% 52.392us 5.821us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.87% 67.583us 0.87% 67.583us 3.218us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.09% 85.261us 1.09% 85.261us 7.105us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.451us 0.03% 2.451us 0.817us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.290us 0.04% 3.290us 1.097us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 70.31% 5.490ms 70.31% 5.490ms 5.490ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 7.808ms +Self CUDA time total: 6.041ms + + + +====================================================================== +PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + torch_mem_eff 2.96% 232.645us 28.95% 2.277ms 2.277ms 0.000us 0.00% 6.207ms 6.207ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.157ms 100.14% 
6.157ms 6.157ms 1 + aten::scaled_dot_product_attention 0.23% 18.052us 1.76% 138.596us 46.199us 0.000us 0.00% 5.492ms 1.831ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 17.731us 1.53% 120.544us 40.181us 0.000us 0.00% 5.492ms 1.831ms 3 + aten::_efficient_attention_forward 0.35% 27.329us 1.02% 80.113us 26.704us 5.492ms 89.32% 5.492ms 1.831ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.492ms 89.32% 5.492ms 1.831ms 3 + aten::contiguous 0.09% 7.269us 23.67% 1.862ms 206.848us 0.000us 0.00% 714.624us 79.403us 9 + aten::clone 0.28% 21.997us 23.58% 1.854ms 206.041us 0.000us 0.00% 714.624us 79.403us 9 + aten::copy_ 0.89% 69.616us 22.61% 1.779ms 197.614us 656.513us 10.68% 714.624us 79.403us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.513us 10.68% 656.513us 72.946us 9 + Activity Buffer Request 17.99% 1.415ms 17.99% 1.415ms 1.415ms 58.111us 0.95% 58.111us 58.111us 1 + aten::transpose 0.63% 49.422us 0.84% 66.332us 2.764us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.22% 16.910us 0.22% 16.910us 0.705us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.593us 0.68% 53.843us 5.983us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.87% 68.381us 0.87% 68.381us 3.256us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 4.00% 314.941us 4.00% 314.941us 26.245us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.380us 0.03% 2.380us 0.793us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.242us 0.04% 3.242us 1.081us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.05% 5.588ms 71.05% 5.588ms 5.588ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 7.865ms +Self CUDA time total: 6.149ms + + + 
+====================================================================== +PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + torch_mem_eff 2.91% 232.917us 28.19% 2.257ms 2.257ms 0.000us 0.00% 6.364ms 6.364ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.313ms 100.13% 6.313ms 6.313ms 1 + aten::scaled_dot_product_attention 0.22% 17.912us 1.77% 142.075us 47.358us 0.000us 0.00% 5.641ms 1.880ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 18.730us 1.55% 124.163us 41.388us 0.000us 0.00% 5.641ms 1.880ms 3 + aten::_efficient_attention_forward 0.36% 29.090us 1.02% 81.873us 27.291us 5.641ms 89.47% 5.641ms 1.880ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.641ms 89.47% 5.641ms 1.880ms 3 + aten::contiguous 0.09% 7.221us 22.98% 1.840ms 204.428us 0.000us 0.00% 723.455us 80.384us 9 + aten::clone 0.27% 21.690us 22.89% 1.833ms 203.626us 0.000us 0.00% 723.455us 80.384us 9 + aten::copy_ 0.78% 62.812us 21.99% 1.761ms 195.631us 663.839us 10.53% 723.455us 80.384us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 663.839us 10.53% 663.839us 73.760us 9 + Activity Buffer Request 18.37% 1.471ms 18.37% 1.471ms 1.471ms 59.616us 0.95% 59.616us 59.616us 1 + aten::transpose 0.60% 48.283us 0.82% 65.922us 2.747us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.22% 17.639us 0.22% 17.639us 0.735us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.816us 0.63% 50.264us 5.585us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.80% 63.840us 0.80% 63.840us 3.040us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.11% 249.257us 3.11% 249.257us 20.771us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.260us 0.03% 2.260us 0.753us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.100us 0.04% 3.100us 1.033us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.81% 5.750ms 71.81% 5.750ms 5.750ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 8.007ms +Self CUDA time total: 6.304ms + + + +====================================================================== +PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + torch_mem_eff 3.10% 262.407us 28.45% 2.407ms 2.407ms 0.000us 0.00% 6.700ms 6.700ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.648ms 100.13% 
6.648ms 6.648ms 1 + aten::scaled_dot_product_attention 0.22% 18.361us 1.72% 145.216us 48.405us 0.000us 0.00% 5.968ms 1.989ms 3 + aten::_scaled_dot_product_efficient_attention 0.22% 18.717us 1.50% 126.855us 42.285us 0.000us 0.00% 5.968ms 1.989ms 3 + aten::_efficient_attention_forward 0.34% 29.081us 1.00% 84.393us 28.131us 5.968ms 89.89% 5.968ms 1.989ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.968ms 89.89% 5.968ms 1.989ms 3 + aten::contiguous 0.09% 7.641us 23.04% 1.949ms 216.566us 0.000us 0.00% 731.964us 81.329us 9 + aten::clone 0.29% 24.377us 22.95% 1.941ms 215.717us 0.000us 0.00% 731.964us 81.329us 9 + aten::copy_ 0.80% 68.015us 22.01% 1.862ms 206.906us 670.941us 10.11% 731.964us 81.329us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 670.941us 10.11% 670.941us 74.549us 9 + Activity Buffer Request 17.04% 1.441ms 17.04% 1.441ms 1.441ms 61.023us 0.92% 61.023us 61.023us 1 + aten::transpose 0.67% 56.417us 0.87% 73.607us 3.067us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.20% 17.190us 0.20% 17.190us 0.716us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 12.051us 0.65% 54.922us 6.102us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.83% 69.821us 0.83% 69.821us 3.325us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 4.44% 375.855us 4.44% 375.855us 31.321us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.230us 0.03% 2.230us 0.743us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.250us 0.04% 3.250us 1.083us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.55% 6.053ms 71.55% 6.053ms 6.053ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 8.459ms +Self CUDA time total: 6.639ms + + +impl wl p50(ms) ok +torch_mem_eff 
cuda_attn_L128_bfloat16 1.86 True +torch_mem_eff cuda_attn_L256_bfloat16 1.99 True +torch_mem_eff cuda_attn_L320_bfloat16 2.02 True +torch_mem_eff cuda_attn_L384_bfloat16 2.04 True +torch_mem_eff cuda_attn_L448_bfloat16 2.06 True +torch_mem_eff cuda_attn_L512_bfloat16 2.22 True +
+
+▶ UV Install Logs
+
+
+
+