# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Compute attention via xFormers' memory-efficient attention op.

    The query, key and value tensors are forwarded unchanged to
    ``xops.memory_efficient_attention`` and its result is returned as-is.
    xFormers expects the inputs laid out as
    [batch, seq_len, heads, head_dim].
    """
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
# Hand the xFormers wrapper to the shared benchmark runner as an attention
# kernel implementation; the name and tags identify this implementation.
run_benchmark(
    impl_func=xformers_attention,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
    kernel_type=KernelTypeEnum.ATTENTION,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 9.60% 449.460us 54.45% 2.550ms 2.550ms 0.000us 0.00% 3.540ms 3.540ms 1
xformers_flash3::flash_fwd 4.00% 187.356us 44.14% 2.067ms 689.137us 0.000us 0.00% 3.540ms 1.180ms 3
flash_attn_3::fwd 1.48% 69.234us 40.14% 1.880ms 626.685us 2.646ms 100.00% 3.540ms 1.180ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.648ms 100.06% 2.648ms 2.648ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.646ms 100.00% 2.646ms 882.010us 3
Activity Buffer Request 36.74% 1.721ms 36.74% 1.721ms 1.721ms 894.309us 33.80% 894.309us 894.309us 1
aten::empty 0.73% 34.410us 0.73% 34.410us 5.735us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.25% 11.780us 0.25% 11.780us 3.927us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.93% 43.670us 0.93% 43.670us 14.557us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.24% 11.301us 0.72% 33.571us 5.595us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.48% 22.270us 0.48% 22.270us 3.712us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 45.55% 2.133ms 45.55% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.684ms
Self CUDA time total: 2.646ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.53% 314.780us 50.80% 2.448ms 2.448ms 0.000us 0.00% 3.745ms 3.745ms 1
xformers_flash3::flash_fwd 2.99% 144.051us 43.78% 2.110ms 703.226us 0.000us 0.00% 3.745ms 1.248ms 3
flash_attn_3::fwd 1.06% 51.161us 40.79% 1.966ms 655.209us 2.793ms 100.00% 3.745ms 1.248ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.06% 2.795ms 2.795ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.793ms 100.00% 2.793ms 931.037us 3
Activity Buffer Request 38.27% 1.844ms 38.27% 1.844ms 1.844ms 952.158us 34.09% 952.158us 952.158us 1
aten::empty 0.59% 28.641us 0.59% 28.641us 4.774us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.380us 0.11% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.75% 36.051us 0.75% 36.051us 12.017us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.19% 9.170us 0.49% 23.510us 3.918us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.30% 14.340us 0.30% 14.340us 2.390us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 49.20% 2.371ms 49.20% 2.371ms 2.371ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.819ms
Self CUDA time total: 2.793ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.41% 306.378us 48.23% 2.306ms 2.306ms 0.000us 0.00% 3.879ms 3.879ms 1
xformers_flash3::flash_fwd 2.97% 141.954us 41.36% 1.977ms 659.046us 0.000us 0.00% 3.879ms 1.293ms 3
flash_attn_3::fwd 1.09% 51.910us 38.39% 1.835ms 611.728us 2.892ms 100.00% 3.879ms 1.293ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.06% 2.893ms 2.893ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.882us 3
Activity Buffer Request 35.83% 1.713ms 35.83% 1.713ms 1.713ms 986.975us 34.13% 986.975us 986.975us 1
aten::empty 0.60% 28.840us 0.60% 28.840us 4.807us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.330us 0.11% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.75% 36.082us 0.75% 36.082us 12.027us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.059us 0.47% 22.400us 3.733us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.30% 14.341us 0.30% 14.341us 2.390us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 51.77% 2.475ms 51.77% 2.475ms 2.475ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.781ms
Self CUDA time total: 2.892ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.14% 305.279us 50.00% 2.487ms 2.487ms 0.000us 0.00% 3.889ms 3.889ms 1
xformers_flash3::flash_fwd 2.94% 146.052us 43.42% 2.159ms 719.674us 0.000us 0.00% 3.889ms 1.296ms 3
flash_attn_3::fwd 1.05% 52.012us 40.48% 2.013ms 670.990us 2.906ms 100.00% 3.889ms 1.296ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.907ms 100.06% 2.907ms 2.907ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.605us 3
Activity Buffer Request 34.76% 1.728ms 34.76% 1.728ms 1.728ms 983.453us 33.84% 983.453us 983.453us 1
aten::empty 0.63% 31.322us 0.63% 31.322us 5.220us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.389us 0.11% 5.389us 1.796us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.94% 195.844us 3.94% 195.844us 65.281us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.560us 0.45% 22.331us 3.722us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.28% 13.771us 0.28% 13.771us 2.295us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 50.00% 2.486ms 50.00% 2.486ms 2.486ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.973ms
Self CUDA time total: 2.906ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.54% 306.968us 45.05% 2.496ms 2.496ms 0.000us 0.00% 4.618ms 4.618ms 1
xformers_flash3::flash_fwd 2.62% 145.024us 39.11% 2.167ms 722.434us 0.000us 0.00% 4.618ms 1.539ms 3
flash_attn_3::fwd 0.92% 51.181us 36.50% 2.022ms 674.093us 3.463ms 100.00% 4.618ms 1.539ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.05% 3.465ms 3.465ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3
Activity Buffer Request 31.42% 1.741ms 31.42% 1.741ms 1.741ms 1.155ms 33.34% 1.155ms 1.155ms 1
aten::empty 0.54% 29.990us 0.54% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.350us 0.10% 5.350us 1.783us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.51% 194.715us 3.51% 194.715us 64.905us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.15% 8.420us 0.40% 22.040us 3.673us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.25% 13.620us 0.25% 13.620us 2.270us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.95% 3.045ms 54.95% 3.045ms 3.045ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.541ms
Self CUDA time total: 3.463ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.16% 304.966us 48.93% 2.893ms 2.893ms 0.000us 0.00% 4.598ms 4.598ms 1
xformers_flash3::flash_fwd 9.37% 553.844us 43.37% 2.564ms 854.584us 0.000us 0.00% 4.598ms 1.533ms 3
flash_attn_3::fwd 0.88% 52.300us 34.00% 2.010ms 669.970us 3.443ms 100.00% 4.598ms 1.533ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.445ms 100.05% 3.445ms 3.445ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.443ms 100.00% 3.443ms 1.148ms 3
Activity Buffer Request 28.71% 1.697ms 28.71% 1.697ms 1.697ms 1.155ms 33.53% 1.155ms 1.155ms 1
aten::empty 0.52% 30.653us 0.52% 30.653us 5.109us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.09% 5.400us 0.09% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.80% 224.365us 3.80% 224.365us 74.788us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.15% 8.918us 0.40% 23.921us 3.987us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.25% 15.003us 0.25% 15.003us 2.501us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 51.07% 3.019ms 51.07% 3.019ms 3.019ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.912ms
Self CUDA time total: 3.443ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 0.98 True
xformers_meff cuda_attn_L256_bfloat16 1.04 True
xformers_meff cuda_attn_L320_bfloat16 1.06 True
xformers_meff cuda_attn_L384_bfloat16 1.09 True
xformers_meff cuda_attn_L448_bfloat16 1.26 True
xformers_meff cuda_attn_L512_bfloat16 1.24 True
▶ UV Install Logs