Cell: benchmark | 35.60s

# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
import torch
import sys
import os
import kernels_benchmark_tools as kbt


def torch_mem_eff(q, k, v):
    # Inputs arrive as (batch, seq, heads, head_dim); SDPA expects
    # (batch, heads, seq, head_dim), so transpose in and back out.
    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
    with torch.nn.attention.sdpa_kernel(
        torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION
    ):
        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
    return o.transpose(1, 2).contiguous()


# Register the implementation with the benchmark harness.
kbt.add(
    "torch_mem_eff",
    torch_mem_eff,
    tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"},
)

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = "float32" if device == "cpu" else "bfloat16"

    # Flux-like workloads, scaled down for CPU testing
    base = 1024 if device == "cuda" else 512
    flux_sizes = (
        [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
    )
    heads = 24 if device == "cuda" else 8
    head_dim = 128 if device == "cuda" else 64

    wl = []
    for L in flux_sizes:
        wl.append(
            {
                "name": f"flux_L{L}",
                "batch": 1,
                "seq_len": base + L,
                "heads": heads,
                "head_dim": head_dim,
                "dtype": dtype,
                "device": device,
                "seed": 0,
            }
        )

    # Run each workload (2 warmup + 5 timed reps), check outputs against the
    # math reference, then summarize the JSONL results.
    kbt.run(
        wl,
        jsonl="attn.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
    )
    kbt.summarize(["attn.jsonl"])
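
Because of the inline script metadata at the top, the file can be launched directly with `uv run`, which resolves numpy, torch, and kernels-benchmark-tools from the header. The registered callable can also be sanity-checked outside the harness. Below is a minimal illustrative sketch (not part of the report) assuming PyTorch 2.3+, where torch.nn.attention.sdpa_kernel accepts a backend: it compares the EFFICIENT_ATTENTION output against the MATH backend, which is roughly the role kbt.attn.ref_math and kbt.attn.cmp_allclose play during the run; the harness's exact tolerances are not reproduced here.

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

torch.manual_seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"
# (batch, heads, seq_len, head_dim) layout, as expected by F.scaled_dot_product_attention
q, k, v = (torch.randn(1, 8, 256, 64, device=device) for _ in range(3))

with sdpa_kernel(SDPBackend.MATH):
    ref = F.scaled_dot_product_attention(q, k, v)

# EFFICIENT_ATTENTION primarily targets CUDA; if no kernel is available on
# the current device, this call raises a RuntimeError.
with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)

print("max abs diff:", (out - ref).abs().max().item())
print("allclose:", torch.allclose(out, ref, rtol=1e-5, atol=1e-5))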

Benchmark output (kbt.summarize over attn.jsonl):

impl            wl          p50(ms)  ok
torch_mem_eff   flux_L128   0.59     True
torch_mem_eff   flux_L256   0.65     True
torch_mem_eff   flux_L320   0.77     True
torch_mem_eff   flux_L384   0.79     True
torch_mem_eff   flux_L448   0.84     True
torch_mem_eff   flux_L512   0.95     True
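
In the summary, impl is the registered implementation name, wl the workload, p50(ms) presumably the median (50th-percentile) latency across the timed repetitions (reps=5 here), and ok whether the output matched the math reference under the allclose comparison. The per-rep records live in attn.jsonl, one JSON object per line; a minimal loading sketch follows (the field names are whatever kernels-benchmark-tools writes, so inspect a record before relying on them):

import json

# attn.jsonl is written by kbt.run: one JSON record per line.
with open("attn.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} records")
print(records[0])  # inspect the schema before any further processing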