| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = [ | |
| # "numpy", | |
| # "torch", | |
| # "kernels-benchmark-tools", | |
| # "xformers", | |
| # ] | |
| # | |
| # [tool.uv.sources] | |
| # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" } | |
| # /// | |
| import torch | |
| import sys | |
| import os | |
| import kernels_benchmark_tools as kbt | |
| import xformers.ops as xops | |
def xformers_attention(q, k, v):
    """Run xFormers memory-efficient attention on ``q``, ``k`` and ``v``.

    The three tensors are forwarded unchanged and the result of
    ``xops.memory_efficient_attention`` is returned as-is.
    """
    # xFormers expects [batch, seq_len, heads, head_dim]; no transpose is
    # applied here, so callers must already supply that layout.
    attend = xops.memory_efficient_attention
    return attend(q, k, v)
# Register the wrapper with the benchmark harness under the name
# "xformers_meff", together with the tags describing this implementation.
kbt.add(
    "xformers_meff",
    xformers_attention,
    tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
)
if __name__ == "__main__":
    # Choose the whole workload profile in one place: a larger bfloat16
    # configuration when CUDA is available, a smaller float32 one otherwise.
    if torch.cuda.is_available():
        device, dtype = "cuda", "bfloat16"
        base, heads, head_dim = 1024, 24, 128
        flux_sizes = [128, 256, 320, 384, 448, 512]
    else:
        device, dtype = "cpu", "float32"
        base, heads, head_dim = 512, 8, 64
        flux_sizes = [64, 128, 192, 256]

    # Flux-like workloads: one entry per size, with the sequence length being
    # the fixed base plus that size.
    wl = [
        {
            "name": f"flux_L{extra}",
            "batch": 1,
            "seq_len": base + extra,
            "heads": heads,
            "head_dim": head_dim,
            "dtype": dtype,
            "device": device,
            "seed": 0,
        }
        for extra in flux_sizes
    ]

    # Run every workload through the harness, then summarize the same JSONL
    # file that was handed to kbt.run.
    kbt.run(
        wl,
        jsonl="attn.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
    )
    kbt.summarize(["attn.jsonl"])