# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Compute attention with xFormers' memory-efficient implementation.

    ``q``, ``k`` and ``v`` are forwarded unchanged, so they must already be
    in the layout xFormers expects: [batch, seq_len, heads, head_dim].
    Only the three tensors are passed; every other option of
    ``memory_efficient_attention`` keeps its library default.

    Returns whatever ``xops.memory_efficient_attention`` returns.
    """
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
# Hand the wrapper above to the shared benchmark harness, registered as the
# "xformers_meff" implementation of the attention kernel type.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="xformers_meff",
    impl_func=xformers_attention,
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 9.64% 463.299us 53.77% 2.584ms 2.584ms 0.000us 0.00% 3.636ms 3.636ms 1
xformers_flash3::flash_fwd 3.92% 188.192us 43.38% 2.085ms 694.978us 0.000us 0.00% 3.636ms 1.212ms 3
flash_attn_3::fwd 1.40% 67.082us 39.46% 1.897ms 632.248us 2.748ms 100.00% 3.636ms 1.212ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.749ms 100.05% 2.749ms 2.749ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.00% 2.748ms 915.935us 3
Activity Buffer Request 36.10% 1.735ms 36.10% 1.735ms 1.735ms 887.807us 32.31% 887.807us 887.807us 1
aten::empty 0.82% 39.381us 0.82% 39.381us 6.563us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.26% 12.540us 0.26% 12.540us 4.180us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.88% 42.510us 0.88% 42.510us 14.170us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.25% 12.121us 0.75% 35.870us 5.978us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.49% 23.749us 0.49% 23.749us 3.958us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 46.23% 2.222ms 46.23% 2.222ms 2.222ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.806ms
Self CUDA time total: 2.748ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.94% 327.436us 51.65% 2.436ms 2.436ms 0.000us 0.00% 3.659ms 3.659ms 1
xformers_flash3::flash_fwd 3.29% 155.063us 44.22% 2.085ms 695.085us 0.000us 0.00% 3.659ms 1.220ms 3
flash_attn_3::fwd 1.15% 54.292us 40.93% 1.930ms 643.398us 2.737ms 100.00% 3.659ms 1.220ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.738ms 100.05% 2.738ms 2.738ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.737ms 100.00% 2.737ms 912.235us 3
Activity Buffer Request 38.21% 1.802ms 38.21% 1.802ms 1.802ms 922.336us 33.70% 922.336us 922.336us 1
aten::empty 0.70% 32.930us 0.70% 32.930us 5.488us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.760us 0.12% 5.760us 1.920us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.75% 35.410us 0.75% 35.410us 11.803us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.20% 9.409us 0.49% 22.989us 3.831us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 13.580us 0.29% 13.580us 2.263us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 48.35% 2.280ms 48.35% 2.280ms 2.280ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.715ms
Self CUDA time total: 2.737ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.35% 296.325us 47.95% 2.238ms 2.238ms 0.000us 0.00% 3.787ms 3.787ms 1
xformers_flash3::flash_fwd 2.95% 137.473us 41.12% 1.919ms 639.648us 0.000us 0.00% 3.787ms 1.262ms 3
flash_attn_3::fwd 1.09% 50.850us 38.17% 1.781ms 593.823us 2.829ms 100.00% 3.787ms 1.262ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.831ms 100.05% 2.831ms 2.831ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.829ms 100.00% 2.829ms 943.127us 3
Activity Buffer Request 35.64% 1.663ms 35.64% 1.663ms 1.663ms 957.186us 33.83% 957.186us 957.186us 1
aten::empty 0.63% 29.301us 0.63% 29.301us 4.884us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.090us 0.11% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.71% 33.151us 0.71% 33.151us 11.050us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.531us 0.48% 22.580us 3.763us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.30% 14.049us 0.30% 14.049us 2.341us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 52.05% 2.429ms 52.05% 2.429ms 2.429ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.667ms
Self CUDA time total: 2.829ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.11% 304.138us 50.43% 2.511ms 2.511ms 0.000us 0.00% 3.860ms 3.860ms 1
xformers_flash3::flash_fwd 3.07% 152.860us 43.87% 2.184ms 727.989us 0.000us 0.00% 3.860ms 1.287ms 3
flash_attn_3::fwd 1.07% 53.395us 40.80% 2.031ms 677.035us 2.883ms 100.00% 3.860ms 1.287ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.05% 2.885ms 2.885ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.00% 2.883ms 961.001us 3
Activity Buffer Request 34.97% 1.741ms 34.97% 1.741ms 1.741ms 977.086us 33.89% 977.086us 977.086us 1
aten::empty 0.66% 32.699us 0.66% 32.699us 5.450us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 6.109us 0.12% 6.109us 2.036us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.98% 197.963us 3.98% 197.963us 65.988us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.489us 0.45% 22.539us 3.757us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.28% 14.050us 0.28% 14.050us 2.342us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 49.57% 2.468ms 49.57% 2.468ms 2.468ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.978ms
Self CUDA time total: 2.883ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.45% 299.105us 45.26% 2.482ms 2.482ms 0.000us 0.00% 4.556ms 4.556ms 1
xformers_flash3::flash_fwd 2.57% 140.761us 39.42% 2.162ms 720.685us 0.000us 0.00% 4.556ms 1.519ms 3
flash_attn_3::fwd 0.92% 50.555us 36.85% 2.021ms 673.765us 3.406ms 100.00% 4.556ms 1.519ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.408ms 100.05% 3.408ms 3.408ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.406ms 100.00% 3.406ms 1.135ms 3
Activity Buffer Request 31.74% 1.741ms 31.74% 1.741ms 1.741ms 1.150ms 33.76% 1.150ms 1.150ms 1
aten::empty 0.52% 28.258us 0.52% 28.258us 4.710us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.340us 0.10% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.58% 196.453us 3.58% 196.453us 65.484us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.14% 7.863us 0.39% 21.181us 3.530us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.24% 13.318us 0.24% 13.318us 2.220us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.74% 3.003ms 54.74% 3.003ms 3.003ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.485ms
Self CUDA time total: 3.406ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.08% 273.484us 44.98% 2.423ms 2.423ms 0.000us 0.00% 4.494ms 4.494ms 1
xformers_flash3::flash_fwd 2.55% 137.253us 39.52% 2.129ms 709.536us 0.000us 0.00% 4.494ms 1.498ms 3
flash_attn_3::fwd 0.94% 50.440us 36.97% 1.991ms 663.785us 3.366ms 100.00% 4.494ms 1.498ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.05% 3.368ms 3.368ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.366ms 100.00% 3.366ms 1.122ms 3
Activity Buffer Request 31.81% 1.713ms 31.81% 1.713ms 1.713ms 1.127ms 33.48% 1.127ms 1.127ms 1
aten::empty 0.56% 30.302us 0.56% 30.302us 5.050us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.300us 0.10% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.56% 191.983us 3.56% 191.983us 63.994us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.15% 8.029us 0.39% 20.930us 3.488us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.24% 12.901us 0.24% 12.901us 2.150us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 55.02% 2.964ms 55.02% 2.964ms 2.964ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.387ms
Self CUDA time total: 3.366ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 0.98 True
xformers_meff cuda_attn_L256_bfloat16 1.03 True
xformers_meff cuda_attn_L320_bfloat16 1.06 True
xformers_meff cuda_attn_L384_bfloat16 1.06 True
xformers_meff cuda_attn_L448_bfloat16 1.25 True
xformers_meff cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs