# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Run xFormers memory-efficient attention over ``q``, ``k`` and ``v``.

    Only the three tensors are forwarded: no attention bias, dropout
    probability, scale or explicit ``op`` is passed, so the library's
    defaults apply.

    Args:
        q: Query tensor.
        k: Key tensor.
        v: Value tensor.

    Returns:
        Whatever ``xops.memory_efficient_attention`` returns for these inputs.
    """
    # Layout note: xFormers wants [batch, seq_len, heads, head_dim]. Nothing is
    # transposed here, so the inputs are assumed to already arrive in that
    # layout -- TODO confirm against the benchmark harness.
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
# Hand the wrapper above to the benchmark runner as the "xformers_meff"
# attention implementation. The tags are passed through as-is; presumably the
# harness uses them to label/group results -- verify against run_benchmark.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
    impl_func=xformers_attention,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 9.93% 451.937us 49.71% 2.262ms 2.262ms 0.000us 0.00% 3.695ms 3.695ms 1
xformers_flash3::flash_fwd 4.26% 193.656us 38.96% 1.773ms 590.904us 0.000us 0.00% 3.695ms 1.232ms 3
flash_attn_3::fwd 1.62% 73.841us 34.71% 1.579ms 526.352us 2.795ms 100.00% 3.695ms 1.232ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.797ms 100.05% 2.797ms 2.797ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.773us 3
Activity Buffer Request 31.17% 1.418ms 31.17% 1.418ms 1.418ms 899.421us 32.18% 899.421us 899.421us 1
aten::empty 0.76% 34.741us 0.76% 34.741us 5.790us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.30% 13.732us 0.30% 13.732us 4.577us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.85% 38.662us 0.85% 38.662us 12.887us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.35% 15.860us 0.82% 37.181us 6.197us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.47% 21.321us 0.47% 21.321us 3.553us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 50.29% 2.288ms 50.29% 2.288ms 2.288ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.550ms
Self CUDA time total: 2.795ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.95% 312.321us 44.96% 2.021ms 2.021ms 0.000us 0.00% 3.832ms 3.832ms 1
xformers_flash3::flash_fwd 3.14% 141.315us 37.51% 1.686ms 561.970us 0.000us 0.00% 3.832ms 1.277ms 3
flash_attn_3::fwd 1.18% 53.030us 34.37% 1.545ms 514.865us 2.890ms 100.00% 3.832ms 1.277ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.05% 2.892ms 2.892ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.00% 2.890ms 963.329us 3
Activity Buffer Request 31.64% 1.422ms 31.64% 1.422ms 1.422ms 942.465us 32.61% 942.465us 942.465us 1
aten::empty 0.68% 30.660us 0.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.592us 0.12% 5.592us 1.864us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.74% 33.432us 0.74% 33.432us 11.144us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.20% 8.951us 0.50% 22.691us 3.782us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 55.04% 2.474ms 55.04% 2.474ms 2.474ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.495ms
Self CUDA time total: 2.890ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.65% 298.008us 44.73% 2.006ms 2.006ms 0.000us 0.00% 3.867ms 3.867ms 1
xformers_flash3::flash_fwd 3.15% 141.235us 37.58% 1.685ms 561.690us 0.000us 0.00% 3.867ms 1.289ms 3
flash_attn_3::fwd 1.18% 53.120us 34.43% 1.544ms 514.611us 2.888ms 100.00% 3.867ms 1.289ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.683us 3
Activity Buffer Request 31.72% 1.422ms 31.72% 1.422ms 1.422ms 978.939us 33.90% 978.939us 978.939us 1
aten::empty 0.67% 30.192us 0.67% 30.192us 5.032us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.491us 0.12% 5.491us 1.830us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.73% 32.901us 0.73% 32.901us 10.967us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.20% 8.773us 0.50% 22.603us 3.767us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.31% 13.830us 0.31% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 55.27% 2.478ms 55.27% 2.478ms 2.478ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.484ms
Self CUDA time total: 2.888ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.31% 299.042us 46.56% 2.205ms 2.205ms 0.000us 0.00% 3.936ms 3.936ms 1
xformers_flash3::flash_fwd 2.97% 140.784us 39.75% 1.883ms 627.609us 0.000us 0.00% 3.936ms 1.312ms 3
flash_attn_3::fwd 1.10% 52.191us 36.78% 1.742ms 580.681us 2.941ms 100.00% 3.936ms 1.312ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.445us 3
Activity Buffer Request 30.11% 1.426ms 30.11% 1.426ms 1.426ms 994.973us 33.83% 994.973us 994.973us 1
aten::empty 0.64% 30.333us 0.64% 30.333us 5.055us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.440us 0.11% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.81% 227.898us 4.81% 227.898us 75.966us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.19% 8.769us 0.49% 23.220us 3.870us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.31% 14.451us 0.31% 14.451us 2.409us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 53.44% 2.531ms 53.44% 2.531ms 2.531ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.736ms
Self CUDA time total: 2.941ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.82% 299.962us 41.73% 2.152ms 2.152ms 0.000us 0.00% 4.566ms 4.566ms 1
xformers_flash3::flash_fwd 2.76% 142.114us 35.47% 1.829ms 609.751us 0.000us 0.00% 4.566ms 1.522ms 3
flash_attn_3::fwd 1.04% 53.631us 32.71% 1.687ms 562.380us 3.419ms 100.00% 4.566ms 1.522ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.420ms 100.05% 3.420ms 3.420ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
Activity Buffer Request 27.56% 1.422ms 27.56% 1.422ms 1.422ms 1.148ms 33.58% 1.148ms 1.148ms 1
aten::empty 0.60% 31.172us 0.60% 31.172us 5.195us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.431us 0.11% 5.431us 1.810us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.40% 175.366us 3.40% 175.366us 58.455us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.849us 0.45% 23.030us 3.838us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.27% 14.181us 0.27% 14.181us 2.363us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 58.27% 3.005ms 58.27% 3.005ms 3.005ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.157ms
Self CUDA time total: 3.419ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.76% 295.800us 41.67% 2.139ms 2.139ms 0.000us 0.00% 4.557ms 4.557ms 1
xformers_flash3::flash_fwd 2.75% 141.044us 35.47% 1.821ms 606.924us 0.000us 0.00% 4.557ms 1.519ms 3
flash_attn_3::fwd 1.04% 53.523us 32.72% 1.680ms 559.910us 3.405ms 100.00% 4.557ms 1.519ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.407ms 100.05% 3.407ms 3.407ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.405ms 100.00% 3.405ms 1.135ms 3
Activity Buffer Request 27.67% 1.420ms 27.67% 1.420ms 1.420ms 1.152ms 33.82% 1.152ms 1.152ms 1
aten::empty 0.60% 30.610us 0.60% 30.610us 5.102us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 6.310us 0.12% 6.310us 2.103us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.29% 168.946us 3.29% 168.946us 56.315us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.721us 0.44% 22.392us 3.732us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.27% 13.671us 0.27% 13.671us 2.279us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 58.33% 2.994ms 58.33% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.133ms
Self CUDA time total: 3.405ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 0.98 True
xformers_meff cuda_attn_L256_bfloat16 1.03 True
xformers_meff cuda_attn_L320_bfloat16 1.08 True
xformers_meff cuda_attn_L384_bfloat16 1.10 True
xformers_meff cuda_attn_L448_bfloat16 1.23 True
xformers_meff cuda_attn_L512_bfloat16 1.22 True
▶ UV Install Logs