Spaces:

kernels-community
/

kernels-benchmarks

Running

drbh HF Staff

Upload folder using huggingface_hub

0cce993 verified 3 months ago

1.71 kB

	# /// script
	# requires-python = ">=3.10"
	# dependencies = [
	# "numpy",
	# "torch",
	# "kernels-benchmark-tools",
	# "xformers",
	# ]
	#
	# [tool.uv.sources]
	# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
	# ///
	import torch
	import sys
	import os
	import kernels_benchmark_tools as kbt
	import xformers.ops as xops


	def xformers_attention(q, k, v):
	    """Run xFormers memory-efficient attention on query, key and value.

	    The three inputs are forwarded unchanged to
	    ``xops.memory_efficient_attention`` and its result is returned as-is.
	    xFormers expects tensors laid out as [batch, seq_len, heads, head_dim].
	    """
	    attn_out = xops.memory_efficient_attention(q, k, v)
	    return attn_out


	# Register the xFormers implementation with the benchmark tools under the
	# name "xformers_meff", together with its descriptive tags.
	kbt.add(
	    "xformers_meff",
	    xformers_attention,
	    tags={
	        "family": "xformers",
	        "backend": "memory_efficient",
	        "compile": "none",
	    },
	)

	if __name__ == "__main__":
	    # Flux-like workloads: every size-related setting depends only on
	    # whether CUDA is available, so choose them all in one place.
	    if torch.cuda.is_available():
	        device, dtype = "cuda", "bfloat16"
	        base, heads, head_dim = 1024, 24, 128
	        flux_sizes = [128, 256, 320, 384, 448, 512]
	    else:
	        device, dtype = "cpu", "float32"
	        base, heads, head_dim = 512, 8, 64
	        flux_sizes = [64, 128, 192, 256]

	    # One workload per flux size; the sequence length is the base length
	    # plus that size, everything else is shared.
	    wl = [
	        {
	            "name": f"flux_L{extra}",
	            "batch": 1,
	            "seq_len": base + extra,
	            "heads": heads,
	            "head_dim": head_dim,
	            "dtype": dtype,
	            "device": device,
	            "seed": 0,
	        }
	        for extra in flux_sizes
	    ]

	    results_path = "attn.jsonl"

	    # Run the registered implementations over the workloads using the
	    # attention helpers shipped with kbt, then summarize the same file
	    # that was passed to the run.
	    kbt.run(
	        wl,
	        jsonl=results_path,
	        reps=5,
	        warmup=2,
	        gen=kbt.attn.gen_qkv,
	        ref=kbt.attn.ref_math,
	        cmp=kbt.attn.cmp_allclose,
	    )
	    kbt.summarize([results_path])