Uploaded using `kernel-builder`.

1fea443 verified 25 days ago

27.5 kB

	#!/usr/bin/env python3
	"""Benchmark flashrt-qkv-cache-rope against a PyTorch eager postprocess chain."""

	from __future__ import annotations

	import argparse
	import ctypes
	import ctypes.util
	import importlib
	import json
	import math
	import os
	import sys
	from dataclasses import asdict, dataclass
	from pathlib import Path

	import torch


	# Two directory levels above the folder that contains this script.
	ROOT = Path(__file__).resolve().parents[2]
	# Source tree of the kernel package being benchmarked.
	PACKAGE = Path(ROOT, "flashrt-qkv-cache-rope")
	# Torch registration templates of kernel-builder, expected in a sibling checkout of ROOT.
	REGISTRATION_INCLUDE = Path(
	    ROOT.parent, "kernels", "kernel-builder", "src", "pyproject", "templates", "torch"
	)

	# Named benchmark shapes, each a (batch, seq_len, heads, head_dim) tuple.
	SHAPES = dict(
	    small=(1, 64, 8, 128),
	    wan_1k=(1, 1024, 24, 128),
	    wan_2520=(1, 2520, 24, 128),
	    wan_4096=(1, 4096, 24, 128),
	    vl_512=(1, 512, 16, 128),
	)
	# Shape names selectable through the --shapes command-line option.
	SHAPE_GROUPS = dict(
	    smoke=["small"],
	    headline=["wan_1k", "wan_2520", "vl_512"],
	    all=list(SHAPES),
	)


	@dataclass
	class Result:
	    """One benchmark row: case description, timings, accuracy metrics and verdict."""

	    # Case label and problem size.
	    shape: str
	    batch: int
	    seq_len: int
	    heads: int
	    head_dim: int
	    # Mean per-call timings in microseconds and their ratio (eager / flashrt).
	    flashrt_us: float
	    torch_eager_us: float
	    speedup_vs_eager: float
	    # 99th-percentile absolute error against the eager reference.
	    q_p99_abs: float
	    k_p99_abs: float
	    # Cosine similarity against the eager reference.
	    q_cosine: float
	    k_cosine: float
	    # "PASS" or "FAIL".
	    status: str


	class SourceOps:
	    """Thin Python wrappers around the custom ops of one ``torch.ops`` namespace.

	    Every method allocates any output buffer the caller left as ``None``,
	    coerces scalar arguments to plain ``int``/``float``, forwards to the op of
	    the same name, and returns the output tensors.
	    """

	    def __init__(self, namespace: str) -> None:
	        """Look up ``torch.ops.<namespace>`` and keep it for later calls."""
	        self._ops = getattr(torch.ops, namespace)

	    def decode_q_norm_rope_stage_bf16(self, q_pre, q_w, cos, sin, eps=1e-6, q_out=None):
	        """Call the decode Q op; ``q_out`` defaults to ``torch.empty_like(q_pre)``."""
	        dst = torch.empty_like(q_pre) if q_out is None else q_out
	        self._ops.decode_q_norm_rope_stage_bf16(q_pre, q_w, cos, sin, float(eps), dst)
	        return dst

	    def decode_k_norm_rope_kvwrite_bf16(self, k_pre, v_pre, k_w, cos, sin, eps=1e-6, k_out=None, v_out=None):
	        """Call the decode K/V op; missing outputs are shaped like ``k_pre`` / ``v_pre``."""
	        k_dst = torch.empty_like(k_pre) if k_out is None else k_out
	        v_dst = torch.empty_like(v_pre) if v_out is None else v_out
	        self._ops.decode_k_norm_rope_kvwrite_bf16(k_pre, v_pre, k_w, cos, sin, float(eps), k_dst, v_dst)
	        return k_dst, v_dst

	    def decode_k_norm_rope_kvwrite_devpos_bf16(self, k_pre, v_pre, k_w, cos, sin, cur_pos, k_cache, v_cache, eps=1e-6):
	        """Call the device-position K/V op and hand back the caller's caches."""
	        op = self._ops.decode_k_norm_rope_kvwrite_devpos_bf16
	        op(k_pre, v_pre, k_w, cos, sin, cur_pos, float(eps), k_cache, v_cache)
	        return k_cache, v_cache

	    def qkv_split_norm_rope_bf16(
	        self, packed, q_w, k_w, freqs_re, freqs_im, heads, head_dim, rope_seq_len=None, eps=1e-6, q_out=None, k_out=None
	    ):
	        """Call the split/norm/RoPE op.

	        ``rope_seq_len`` defaults to ``packed.shape[1]``. ``q_out`` defaults to a
	        bfloat16 tensor of shape ``(packed.shape[0], packed.shape[1], heads,
	        head_dim)`` on ``packed``'s device, and ``k_out`` to a tensor like ``q_out``.
	        """
	        rope_len = packed.shape[1] if rope_seq_len is None else rope_seq_len
	        if q_out is None:
	            out_shape = (packed.shape[0], packed.shape[1], heads, head_dim)
	            q_out = torch.empty(out_shape, device=packed.device, dtype=torch.bfloat16)
	        if k_out is None:
	            k_out = torch.empty_like(q_out)
	        scalars = (int(heads), int(head_dim), int(rope_len), float(eps))
	        self._ops.qkv_split_norm_rope_bf16(packed, q_w, k_w, freqs_re, freqs_im, *scalars, q_out, k_out)
	        return q_out, k_out

	    def qkv_split_joint3_cat_bf16(
	        self,
	        packed_v,
	        qkv_v_bias,
	        norm_v_q_weight,
	        norm_v_k_weight,
	        freqs_re,
	        freqs_im,
	        packed_a,
	        norm_a_q_weight,
	        norm_a_k_weight,
	        packed_u,
	        norm_u_q_weight,
	        norm_u_k_weight,
	        heads,
	        head_dim,
	        q_cat_out,
	        k_cat_out,
	        v_cat_out,
	        rope_seq_len=None,
	        eps_v=1e-6,
	        eps_a=1e-6,
	        eps_u=1e-6,
	    ):
	        """Call the three-stream op; ``rope_seq_len`` defaults to ``packed_v.shape[1]``.

	        The three ``*_cat_out`` tensors are caller-provided and returned as-is.
	        """
	        rope_len = packed_v.shape[1] if rope_seq_len is None else rope_seq_len
	        v_stream = (packed_v, qkv_v_bias, norm_v_q_weight, norm_v_k_weight, freqs_re, freqs_im)
	        a_stream = (packed_a, norm_a_q_weight, norm_a_k_weight)
	        u_stream = (packed_u, norm_u_q_weight, norm_u_k_weight)
	        self._ops.qkv_split_joint3_cat_bf16(
	            *v_stream,
	            *a_stream,
	            *u_stream,
	            int(heads),
	            int(head_dim),
	            int(rope_len),
	            float(eps_v),
	            float(eps_a),
	            float(eps_u),
	            q_cat_out,
	            k_cat_out,
	            v_cat_out,
	        )
	        return q_cat_out, k_cat_out, v_cat_out

	    def qkv_split_rope_kvcache_bf16(
	        self,
	        packed_qkv,
	        rope,
	        q_heads,
	        kv_heads,
	        head_dim,
	        cache_offset,
	        q_out=None,
	        k_cache=None,
	        v_cache=None,
	        max_seq_len=None,
	    ):
	        """Call the split/RoPE/KV-cache op.

	        ``q_out`` defaults to a bfloat16 ``(batch, seq_len, q_heads, head_dim)``
	        tensor. A missing cache is allocated as bfloat16 ``(batch, max_seq_len,
	        kv_heads, head_dim)``, where ``max_seq_len`` defaults to
	        ``cache_offset + seq_len``; ``max_seq_len`` is ignored when both caches
	        are supplied.
	        """
	        batch, seq_len, _ = packed_qkv.shape
	        device = packed_qkv.device
	        if q_out is None:
	            q_out = torch.empty((batch, seq_len, q_heads, head_dim), device=device, dtype=torch.bfloat16)
	        need_k, need_v = k_cache is None, v_cache is None
	        if need_k or need_v:
	            capacity = cache_offset + seq_len if max_seq_len is None else max_seq_len
	            cache_shape = (batch, capacity, kv_heads, head_dim)
	            if need_k:
	                k_cache = torch.empty(cache_shape, device=device, dtype=torch.bfloat16)
	            if need_v:
	                v_cache = torch.empty(cache_shape, device=device, dtype=torch.bfloat16)
	        scalars = (int(q_heads), int(kv_heads), int(head_dim), int(cache_offset))
	        self._ops.qkv_split_rope_kvcache_bf16(packed_qkv, rope, *scalars, q_out, k_cache, v_cache)
	        return q_out, k_cache, v_cache


	def _preload_cublaslt() -> None:
	    """Load cuBLASLt into the process with ``RTLD_GLOBAL``.

	    Walks up from the installed ``torch`` package looking for a bundled
	    ``nvidia/cublas/lib/libcublasLt.so.12`` and loads the first one found.
	    Failing that, asks ``ctypes.util.find_library`` for ``cublasLt``; if that
	    finds nothing the function returns without loading anything.
	    """
	    bundled_rel = Path("nvidia", "cublas", "lib", "libcublasLt.so.12")
	    torch_file = Path(torch.__file__).resolve()
	    for ancestor in torch_file.parents:
	        bundled = ancestor / bundled_rel
	        if not bundled.exists():
	            continue
	        ctypes.CDLL(str(bundled), mode=ctypes.RTLD_GLOBAL)
	        return
	    fallback = ctypes.util.find_library("cublasLt")
	    if fallback:
	        ctypes.CDLL(fallback, mode=ctypes.RTLD_GLOBAL)


	def _current_arch_list() -> str:
	    """Return the compute capability of CUDA device 0 as ``"<major>.<minor>"``."""
	    major, minor = torch.cuda.get_device_capability(0)
	    return ".".join((str(major), str(minor)))


	def load_source_ops() -> SourceOps:
	    """JIT-build the kernel sources with ``torch.utils.cpp_extension.load``.

	    The extension is built under the name ``flashrt_qkv_cache_rope_benchmark``
	    and the same name is used as the ``torch.ops`` namespace wrapped by the
	    returned ``SourceOps``.

	    Raises:
	        RuntimeError: if ``REGISTRATION_INCLUDE`` is not an existing directory.
	    """
	    from torch.utils.cpp_extension import load as jit_load

	    if not REGISTRATION_INCLUDE.is_dir():
	        raise RuntimeError(f"missing kernel-builder registration include: {REGISTRATION_INCLUDE}")
	    _preload_cublaslt()
	    # Only fills the variable in when the user has not set it already.
	    local_arch = _current_arch_list()
	    os.environ.setdefault("TORCH_CUDA_ARCH_LIST", local_arch)

	    namespace = "flashrt_qkv_cache_rope_benchmark"
	    csrc_dir = PACKAGE / "csrc"
	    source_files = [
	        PACKAGE / "torch-ext" / "torch_binding.cpp",
	        csrc_dir / "qkv_cache_rope.cu",
	    ]
	    jit_load(
	        name=namespace,
	        sources=[str(src) for src in source_files],
	        extra_include_paths=[str(csrc_dir), str(REGISTRATION_INCLUDE)],
	        extra_cflags=["-O3", "-DCUDA_KERNEL"],
	        extra_cuda_cflags=["-O3", "--expt-relaxed-constexpr", "-DCUDA_KERNEL"],
	        verbose=False,
	    )
	    return SourceOps(namespace)


	def load_installed_ops(artifact: str | None):
	    """Import and return the installed ``flashrt_qkv_cache_rope`` module.

	    Args:
	        artifact: optional directory placed at the front of ``sys.path`` for
	            the duration of the import. ``None`` or an empty string imports
	            from the existing path.

	    Returns:
	        The imported module.

	    Raises:
	        ImportError: if the module cannot be imported.
	    """
	    if not artifact:
	        return importlib.import_module("flashrt_qkv_cache_rope")
	    sys.path.insert(0, artifact)
	    try:
	        return importlib.import_module("flashrt_qkv_cache_rope")
	    finally:
	        # The imported package may have edited sys.path itself; only undo our
	        # own entry if it is still present so cleanup never masks the result.
	        if artifact in sys.path:
	            sys.path.remove(artifact)


	def make_freqs(seq_len: int, head_dim: int):
	    """Return contiguous float32 ``(cos, sin)`` tables of random angles on CUDA.

	    Both tensors have shape ``(seq_len, head_dim // 2)``.
	    """
	    angles = torch.randn((seq_len, head_dim // 2), device="cuda", dtype=torch.float32)
	    freqs_re = torch.cos(angles).contiguous()
	    freqs_im = torch.sin(angles).contiguous()
	    return freqs_re, freqs_im


	def make_interleaved_rope(seq_len: int, head_dim: int):
	    """Return a bfloat16 ``(seq_len, head_dim)`` table of random angles on CUDA.

	    Along the last axis the values alternate ``cos, sin`` for each angle.
	    """
	    angles = torch.randn((seq_len, head_dim // 2), device="cuda", dtype=torch.float32)
	    cos_sin = [fn(angles).to(torch.bfloat16) for fn in (torch.cos, torch.sin)]
	    interleaved = torch.stack(cos_sin, dim=-1)
	    return interleaved.reshape(seq_len, head_dim).contiguous()


	def make_case(batch: int, seq_len: int, heads: int, head_dim: int):
	    """Build random CUDA inputs and output buffers for one packed-QKV case.

	    Returns ``(packed, q_w, k_w, freqs_re, freqs_im, q_out, k_out)`` where
	    ``packed`` is ``(batch, seq_len, 3 * heads * head_dim)`` bfloat16, the
	    weights are ``(heads * head_dim,)`` bfloat16 values around 1.0, and the
	    outputs are uninitialised ``(batch, seq_len, heads, head_dim)`` bfloat16.
	    """
	    dim = heads * head_dim
	    cuda_bf16 = {"device": "cuda", "dtype": torch.bfloat16}

	    def norm_weight():
	        return (1.0 + 0.1 * torch.randn((dim,), **cuda_bf16)).contiguous()

	    # Random draws happen in this order: packed, q_w, k_w, then the RoPE angles.
	    packed = torch.randn((batch, seq_len, 3 * dim), **cuda_bf16)
	    q_w = norm_weight()
	    k_w = norm_weight()
	    freqs_re, freqs_im = make_freqs(seq_len, head_dim)
	    q_out = torch.empty((batch, seq_len, heads, head_dim), **cuda_bf16)
	    k_out = torch.empty_like(q_out)
	    return packed, q_w, k_w, freqs_re, freqs_im, q_out, k_out


	def make_decode_case(heads: int):
	    """Build random CUDA inputs for the decode benchmarks (head size fixed at 128).

	    Returns ``(q, k, v, q_w, k_w, cos, sin)``: ``q``/``k``/``v`` are
	    ``(heads, 128)`` bfloat16, the weights are ``(128,)`` bfloat16 values
	    around 1.0, and ``cos``/``sin`` are ``(64,)`` bfloat16.
	    """
	    cuda_bf16 = {"device": "cuda", "dtype": torch.bfloat16}

	    def activation():
	        return torch.randn((heads, 128), **cuda_bf16)

	    def norm_weight():
	        return (1.0 + 0.1 * torch.randn((128,), **cuda_bf16)).contiguous()

	    # Random draws happen in this order: q, k, v, q_w, k_w, then the angles.
	    q, k, v = activation(), activation(), activation()
	    q_w, k_w = norm_weight(), norm_weight()
	    angles = torch.randn((64,), device="cuda", dtype=torch.float32)
	    cos = torch.cos(angles).to(torch.bfloat16).contiguous()
	    sin = torch.sin(angles).to(torch.bfloat16).contiguous()
	    return q, k, v, q_w, k_w, cos, sin


	def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float):
	    """Return ``x * rsqrt(mean(x**2, dim=-1) + eps) * weight`` computed in float32."""
	    xf = x.float()
	    mean_sq = torch.mean(xf * xf, dim=-1, keepdim=True)
	    inv_rms = torch.rsqrt(mean_sq + eps)
	    return xf * inv_rms * weight.float()


	def apply_pair_rope(x: torch.Tensor, freqs_re: torch.Tensor, freqs_im: torch.Tensor):
	    """Rotate adjacent element pairs of ``x`` by per-position angles.

	    ``x`` is ``(batch, seq_len, heads, head_dim)``; each consecutive pair
	    ``(re, im)`` along the last axis is multiplied, as a complex number, by
	    ``freqs_re + i * freqs_im``. Both tables must be viewable as
	    ``(1, seq_len, 1, head_dim // 2)``. The result is bfloat16.
	    """
	    batch, seq_len, heads, head_dim = x.shape
	    half = head_dim // 2
	    pairs = x.float().reshape(batch, seq_len, heads, half, 2)
	    real, imag = pairs[..., 0], pairs[..., 1]
	    cos = freqs_re.view(1, seq_len, 1, half)
	    sin = freqs_im.view(1, seq_len, 1, half)
	    rotated = torch.stack((real * cos - imag * sin, real * sin + imag * cos), dim=-1)
	    # Normalise to float32 before the final bfloat16 rounding.
	    rotated = rotated.float()
	    return rotated.reshape(batch, seq_len, heads, head_dim).to(torch.bfloat16)


	def apply_interleaved_pair_rope(x: torch.Tensor, rope: torch.Tensor):
	    """Rotate adjacent element pairs of ``x`` using an interleaved cos/sin table.

	    ``x`` is ``(batch, seq_len, heads, head_dim)``. The first ``seq_len`` rows
	    of ``rope`` are read as ``(seq_len, head_dim // 2, 2)`` with ``cos`` in
	    slot 0 and ``sin`` in slot 1. The result is bfloat16.
	    """
	    batch, seq_len, heads, head_dim = x.shape
	    half = head_dim // 2
	    pairs = x.float().reshape(batch, seq_len, heads, half, 2)
	    real, imag = pairs[..., 0], pairs[..., 1]
	    table = rope[:seq_len].float().reshape(seq_len, half, 2)
	    cos = table[..., 0].view(1, seq_len, 1, half)
	    sin = table[..., 1].view(1, seq_len, 1, half)
	    rotated = torch.stack((real * cos - imag * sin, real * sin + imag * cos), dim=-1)
	    return rotated.reshape(batch, seq_len, heads, head_dim).to(torch.bfloat16)


	def apply_rotate_half_rope_128(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
	    """Apply rotate-half RoPE to rows of 128 elements.

	    Column ``i`` (``i < 64``) is paired with column ``i + 64``; ``cos`` and
	    ``sin`` each hold 64 values shared by every row. The result is bfloat16.
	    """
	    xf = x.float()
	    lower, upper = xf[:, :64], xf[:, 64:]
	    c = cos.float().view(1, 64)
	    s = sin.float().view(1, 64)
	    rotated = torch.cat((lower * c - upper * s, upper * c + lower * s), dim=1)
	    return rotated.to(torch.bfloat16)


	def torch_ref(packed, q_w, k_w, freqs_re, freqs_im, heads, head_dim, eps):
	    """Eager reference: slice Q and K out of ``packed``, RMS-norm, then pair-RoPE.

	    Q is the first ``heads * head_dim`` features of the last axis and K the
	    next ``heads * head_dim``. Returns ``(q, k)`` as bfloat16
	    ``(batch, seq_len, heads, head_dim)`` tensors.
	    """
	    batch, seq_len, _ = packed.shape
	    dim = heads * head_dim
	    out_shape = (batch, seq_len, heads, head_dim)

	    def branch(features, weight):
	        normed = rms_norm(features, weight, eps).to(torch.bfloat16).view(*out_shape)
	        return apply_pair_rope(normed, freqs_re, freqs_im)

	    q_ref = branch(packed[:, :, :dim], q_w)
	    k_ref = branch(packed[:, :, dim : 2 * dim], k_w)
	    return q_ref, k_ref


	def torch_ref_bias(packed, qkv_bias, q_w, k_w, freqs_re, freqs_im, heads, head_dim, eps):
	    """Eager reference with a QKV bias added in float32 before the split.

	    Returns ``(q, k, v)`` as bfloat16 ``(batch, seq_len, heads, head_dim)``
	    tensors: Q and K are RMS-normed and pair-rotated, V is only biased.
	    """
	    batch, seq_len, _ = packed.shape
	    dim = heads * head_dim
	    out_shape = (batch, seq_len, heads, head_dim)
	    biased = packed.float() + qkv_bias.float().view(1, 1, 3 * dim)
	    q_part = biased[:, :, :dim]
	    k_part = biased[:, :, dim : 2 * dim]
	    v_part = biased[:, :, 2 * dim :]
	    v_ref = v_part.to(torch.bfloat16).view(*out_shape)
	    q_normed = rms_norm(q_part, q_w, eps).to(torch.bfloat16).view(*out_shape)
	    k_normed = rms_norm(k_part, k_w, eps).to(torch.bfloat16).view(*out_shape)
	    q_ref = apply_pair_rope(q_normed, freqs_re, freqs_im)
	    k_ref = apply_pair_rope(k_normed, freqs_re, freqs_im)
	    return q_ref, k_ref, v_ref


	def torch_ref_no_rope(packed, q_w, k_w, heads, head_dim, eps):
	    """Eager reference without RoPE: RMS-norm Q and K, pass V through.

	    Returns ``(q, k, v)`` shaped ``(batch, seq_len, heads, head_dim)``; Q and
	    K are bfloat16, V is a view of ``packed``.
	    """
	    batch, seq_len, _ = packed.shape
	    dim = heads * head_dim
	    out_shape = (batch, seq_len, heads, head_dim)
	    q_part = packed[:, :, :dim]
	    k_part = packed[:, :, dim : 2 * dim]
	    v_ref = packed[:, :, 2 * dim :].view(*out_shape)
	    q_ref = rms_norm(q_part, q_w, eps).to(torch.bfloat16).view(*out_shape)
	    k_ref = rms_norm(k_part, k_w, eps).to(torch.bfloat16).view(*out_shape)
	    return q_ref, k_ref, v_ref


	def torch_ref_decode(x, weight, cos, sin, eps):
	    """Eager decode reference: RMS-norm ``x``, round to bfloat16, then rotate-half RoPE."""
	    normed = rms_norm(x, weight, eps).to(torch.bfloat16)
	    return apply_rotate_half_rope_128(normed, cos, sin)


	def torch_ref_kvcache(packed_qkv, rope, q_heads, kv_heads, head_dim):
	    """Eager reference for the KV-cache case: split Q/K/V and rotate Q and K.

	    The last axis of ``packed_qkv`` holds ``q_heads`` query heads, then
	    ``kv_heads`` key heads, then the value heads. Returns ``(q, k, v)`` where
	    V is an unrotated view of the input.
	    """
	    batch, seq_len, _ = packed_qkv.shape
	    q_end = q_heads * head_dim
	    k_end = q_end + kv_heads * head_dim

	    def heads_view(start, stop, n_heads):
	        return packed_qkv[:, :, start:stop].view(batch, seq_len, n_heads, head_dim)

	    q = heads_view(None, q_end, q_heads)
	    k = heads_view(q_end, k_end, kv_heads)
	    v = heads_view(k_end, None, kv_heads)
	    return apply_interleaved_pair_rope(q, rope), apply_interleaved_pair_rope(k, rope), v


	def make_joint3_case(video_len: int, action_len: int, und_len: int, heads: int, head_dim: int):
	    """Build random CUDA inputs for the three-stream (video/action/und) case.

	    Returns a 15-tuple: the video stream's packed tensor, bias, Q/K weights
	    and RoPE tables; the packed tensor and Q/K weights of the action and und
	    streams; and uninitialised bfloat16 ``q``/``k``/``v`` buffers of shape
	    ``(1, video_len + action_len + und_len, heads, head_dim)``.
	    """
	    # Random draws happen in this order: video, action, und, then the bias.
	    video = make_case(1, video_len, heads, head_dim)
	    action = make_case(1, action_len, heads, head_dim)
	    und = make_case(1, und_len, heads, head_dim)
	    packed_v, v_q_w, v_k_w, freqs_re, freqs_im = video[:5]
	    packed_a, a_q_w, a_k_w = action[:3]
	    packed_u, u_q_w, u_k_w = und[:3]

	    dim = heads * head_dim
	    qkv_v_bias = (0.02 * torch.randn((3 * dim,), device="cuda", dtype=torch.bfloat16)).contiguous()

	    total = video_len + action_len + und_len
	    q_cat = torch.empty((1, total, heads, head_dim), device="cuda", dtype=torch.bfloat16)
	    k_cat = torch.empty_like(q_cat)
	    v_cat = torch.empty_like(q_cat)
	    video_inputs = (packed_v, qkv_v_bias, v_q_w, v_k_w, freqs_re, freqs_im)
	    action_inputs = (packed_a, a_q_w, a_k_w)
	    und_inputs = (packed_u, u_q_w, u_k_w)
	    return (*video_inputs, *action_inputs, *und_inputs, q_cat, k_cat, v_cat)


	def torch_ref_joint3(
	    packed_v,
	    qkv_v_bias,
	    v_q_w,
	    v_k_w,
	    freqs_re,
	    freqs_im,
	    packed_a,
	    a_q_w,
	    a_k_w,
	    packed_u,
	    u_q_w,
	    u_k_w,
	    heads,
	    head_dim,
	    eps,
	):
	    """Eager reference for the three-stream case.

	    The video stream gets bias + RMS-norm + pair RoPE; the action and und
	    streams get RMS-norm only. Returns ``(q, k, v)``, each the three streams
	    concatenated along the sequence axis in video, action, und order.
	    """
	    video = torch_ref_bias(packed_v, qkv_v_bias, v_q_w, v_k_w, freqs_re, freqs_im, heads, head_dim, eps)
	    action = torch_ref_no_rope(packed_a, a_q_w, a_k_w, heads, head_dim, eps)
	    und = torch_ref_no_rope(packed_u, u_q_w, u_k_w, heads, head_dim, eps)
	    q_parts, k_parts, v_parts = (list(parts) for parts in zip(video, action, und))
	    return torch.cat(q_parts, dim=1), torch.cat(k_parts, dim=1), torch.cat(v_parts, dim=1)


	def time_us(fn, warmup: int, iters: int) -> float:
	    """Return the mean per-call time of ``fn`` in microseconds, using CUDA events.

	    Args:
	        fn: zero-argument callable to time.
	        warmup: number of untimed calls made first; values <= 0 skip warm-up.
	        iters: number of timed calls; must be positive.

	    Returns:
	        Elapsed time between the two recorded events divided by ``iters``.

	    Raises:
	        ValueError: if ``iters`` is not positive. Checked before any call so a
	            bad setting fails immediately instead of dividing by zero (or
	            reporting a negative time) after the work has run.
	    """
	    if iters <= 0:
	        raise ValueError(f"iters must be a positive integer, got {iters}")
	    for _ in range(warmup):
	        fn()
	    # Drain the warm-up work so it is not attributed to the timed region.
	    torch.cuda.synchronize()
	    start = torch.cuda.Event(enable_timing=True)
	    end = torch.cuda.Event(enable_timing=True)
	    start.record()
	    for _ in range(iters):
	        fn()
	    end.record()
	    torch.cuda.synchronize()
	    # elapsed_time() reports milliseconds; convert to microseconds per call.
	    return start.elapsed_time(end) * 1000.0 / iters


	def percentile(x: torch.Tensor, q: float) -> torch.Tensor:
	    """Return the ``ceil(q * n)``-th smallest element of ``x`` as a 0-d tensor.

	    ``n`` is the element count of ``x``; the rank is clamped to ``[1, n]``.
	    """
	    values = x.flatten()
	    count = values.numel()
	    rank = math.ceil(q * count)
	    rank = min(count, rank)
	    rank = max(1, rank)
	    return values.kthvalue(rank).values


	def metrics(got, expected):
	    """Return ``(p99_abs_error, cosine_similarity)`` of ``got`` against ``expected``.

	    Both tensors are compared in float32 after flattening; both results are
	    Python floats.
	    """
	    got_flat = got.float().flatten()
	    exp_flat = expected.float().flatten()
	    abs_err = (got.float() - expected.float()).abs().flatten()
	    p99 = float(percentile(abs_err, 0.99).item())
	    cosine = float(torch.nn.functional.cosine_similarity(got_flat, exp_flat, dim=0).item())
	    return p99, cosine


	def run_one(ops, name: str, shape: tuple[int, int, int, int], args) -> Result:
	    """Check and time ``qkv_split_norm_rope_bf16`` for one ``(B, L, H, D)`` shape.

	    Accuracy is measured once against ``torch_ref``; the kernel and the eager
	    reference are then timed with ``args.warmup`` / ``args.iters``. The status
	    is ``"PASS"`` when both Q and K p99 errors are within ``args.p99_abs_limit``.
	    """
	    batch, seq_len, heads, head_dim = shape
	    packed, q_w, k_w, freqs_re, freqs_im, q_out, k_out = make_case(*shape)
	    eps = args.eps
	    inputs = (packed, q_w, k_w, freqs_re, freqs_im, heads, head_dim)

	    def flashrt_fn():
	        return ops.qkv_split_norm_rope_bf16(*inputs, seq_len, eps, q_out, k_out)

	    def eager_fn():
	        return torch_ref(*inputs, eps)

	    got_q, got_k = flashrt_fn()
	    exp_q, exp_k = eager_fn()
	    q_p99, q_cos = metrics(got_q, exp_q)
	    k_p99, k_cos = metrics(got_k, exp_k)

	    flashrt_us = time_us(flashrt_fn, args.warmup, args.iters)
	    eager_us = time_us(eager_fn, args.warmup, args.iters)

	    within_limit = all(err <= args.p99_abs_limit for err in (q_p99, k_p99))
	    return Result(
	        shape=name,
	        batch=batch,
	        seq_len=seq_len,
	        heads=heads,
	        head_dim=head_dim,
	        flashrt_us=flashrt_us,
	        torch_eager_us=eager_us,
	        speedup_vs_eager=eager_us / flashrt_us,
	        q_p99_abs=q_p99,
	        k_p99_abs=k_p99,
	        q_cosine=q_cos,
	        k_cosine=k_cos,
	        status="PASS" if within_limit else "FAIL",
	    )


	def run_joint3(ops, name: str, video_len: int, action_len: int, und_len: int, heads: int, head_dim: int, args) -> Result:
	    """Check and time ``qkv_split_joint3_cat_bf16`` against ``torch_ref_joint3``.

	    Only the concatenated Q and K outputs are scored; V is not compared. The
	    reported ``seq_len`` is the total of the three stream lengths and the
	    same ``args.eps`` is used for all three streams.
	    """
	    # The first twelve entries are the op inputs; the last three are output buffers.
	    *inputs, q_cat, k_cat, v_cat = make_joint3_case(video_len, action_len, und_len, heads, head_dim)
	    eps = args.eps

	    def flashrt_fn():
	        return ops.qkv_split_joint3_cat_bf16(
	            *inputs, heads, head_dim, q_cat, k_cat, v_cat, video_len, eps, eps, eps
	        )

	    def eager_fn():
	        return torch_ref_joint3(*inputs, heads, head_dim, eps)

	    got_q, got_k, _ = flashrt_fn()
	    exp_q, exp_k, _ = eager_fn()
	    q_p99, q_cos = metrics(got_q, exp_q)
	    k_p99, k_cos = metrics(got_k, exp_k)

	    flashrt_us = time_us(flashrt_fn, args.warmup, args.iters)
	    eager_us = time_us(eager_fn, args.warmup, args.iters)

	    within_limit = all(err <= args.p99_abs_limit for err in (q_p99, k_p99))
	    return Result(
	        shape=name,
	        batch=1,
	        seq_len=video_len + action_len + und_len,
	        heads=heads,
	        head_dim=head_dim,
	        flashrt_us=flashrt_us,
	        torch_eager_us=eager_us,
	        speedup_vs_eager=eager_us / flashrt_us,
	        q_p99_abs=q_p99,
	        k_p99_abs=k_p99,
	        q_cosine=q_cos,
	        k_cosine=k_cos,
	        status="PASS" if within_limit else "FAIL",
	    )


	def run_decode_q(ops, name: str, heads: int, args) -> Result:
	    """Check and time ``decode_q_norm_rope_stage_bf16`` against ``torch_ref_decode``.

	    Only Q is exercised, so the K columns of the result are filled with the
	    placeholder values 0.0 (error) and 1.0 (cosine).
	    """
	    q, _, _, q_w, _, cos, sin = make_decode_case(heads)
	    q_out = torch.empty_like(q)
	    eps = args.eps

	    def flashrt_fn():
	        return ops.decode_q_norm_rope_stage_bf16(q, q_w, cos, sin, eps, q_out)

	    def eager_fn():
	        return torch_ref_decode(q, q_w, cos, sin, eps)

	    q_p99, q_cos = metrics(flashrt_fn(), eager_fn())
	    flashrt_us = time_us(flashrt_fn, args.warmup, args.iters)
	    eager_us = time_us(eager_fn, args.warmup, args.iters)

	    verdict = "PASS" if q_p99 <= args.p99_abs_limit else "FAIL"
	    return Result(
	        shape=name,
	        batch=1,
	        seq_len=1,
	        heads=heads,
	        head_dim=128,
	        flashrt_us=flashrt_us,
	        torch_eager_us=eager_us,
	        speedup_vs_eager=eager_us / flashrt_us,
	        q_p99_abs=q_p99,
	        k_p99_abs=0.0,
	        q_cosine=q_cos,
	        k_cosine=1.0,
	        status=verdict,
	    )


	def run_decode_kv(ops, name: str, heads: int, devpos: bool, args) -> Result:
	    """Check and time one of the decode K/V-write ops.

	    With ``devpos`` false the op writes into single-slot buffers; with
	    ``devpos`` true it writes into an 8-slot cache at position 3, supplied as
	    an int32 device tensor. K is scored against ``torch_ref_decode`` and V must
	    match the input exactly. The V metrics are reported in the ``q_*`` fields
	    of the result.
	    """
	    _, k, v, _, k_w, cos, sin = make_decode_case(heads)
	    k_slot, v_slot = torch.empty_like(k), torch.empty_like(v)
	    eps = args.eps
	    exp_k = torch_ref_decode(k, k_w, cos, sin, eps)

	    if not devpos:

	        def flashrt_fn():
	            return ops.decode_k_norm_rope_kvwrite_bf16(k, v, k_w, cos, sin, eps, k_slot, v_slot)

	        def eager_fn():
	            k_slot.copy_(torch_ref_decode(k, k_w, cos, sin, eps))
	            v_slot.copy_(v)
	            return k_slot, v_slot

	        got_k, got_v = flashrt_fn()
	    else:
	        pos = 3
	        k_cache = torch.empty((8, heads, 128), device="cuda", dtype=torch.bfloat16)
	        v_cache = torch.empty_like(k_cache)
	        cur_pos = torch.tensor([pos], device="cuda", dtype=torch.int32)

	        def flashrt_fn():
	            return ops.decode_k_norm_rope_kvwrite_devpos_bf16(k, v, k_w, cos, sin, cur_pos, k_cache, v_cache, eps)

	        def eager_fn():
	            k_cache[pos].copy_(torch_ref_decode(k, k_w, cos, sin, eps))
	            v_cache[pos].copy_(v)
	            return k_cache, v_cache

	        flashrt_fn()
	        got_k, got_v = k_cache[pos], v_cache[pos]

	    # Metrics are taken before timing, while the buffers still hold the kernel's output.
	    k_p99, k_cos = metrics(got_k, exp_k)
	    v_p99, v_cos = metrics(got_v, v)
	    flashrt_us = time_us(flashrt_fn, args.warmup, args.iters)
	    eager_us = time_us(eager_fn, args.warmup, args.iters)

	    passed = k_p99 <= args.p99_abs_limit and v_p99 == 0.0
	    return Result(
	        shape=name,
	        batch=1,
	        seq_len=1,
	        heads=heads,
	        head_dim=128,
	        flashrt_us=flashrt_us,
	        torch_eager_us=eager_us,
	        speedup_vs_eager=eager_us / flashrt_us,
	        q_p99_abs=v_p99,
	        k_p99_abs=k_p99,
	        q_cosine=v_cos,
	        k_cosine=k_cos,
	        status="PASS" if passed else "FAIL",
	    )


	def run_kvcache_gqa(
	    ops,
	    name: str,
	    batch: int,
	    seq_len: int,
	    q_heads: int,
	    kv_heads: int,
	    head_dim: int,
	    args,
	) -> Result:
	    """Check and time ``qkv_split_rope_kvcache_bf16`` against ``torch_ref_kvcache``.

	    The caches are allocated with two spare rows on either side of the written
	    window (cache offset 2). Q and K are scored against the p99 limit and V
	    must match exactly. The result's ``q_*`` fields hold the worse of the Q
	    and V metrics.
	    """
	    cuda_bf16 = {"device": "cuda", "dtype": torch.bfloat16}
	    qkv_dim = (q_heads + 2 * kv_heads) * head_dim
	    packed = torch.randn((batch, seq_len, qkv_dim), **cuda_bf16)
	    rope = make_interleaved_rope(seq_len, head_dim)
	    cache_offset = 2
	    max_seq_len = cache_offset + seq_len + 2
	    q_out = torch.empty((batch, seq_len, q_heads, head_dim), **cuda_bf16)
	    k_cache = torch.empty((batch, max_seq_len, kv_heads, head_dim), **cuda_bf16)
	    v_cache = torch.empty_like(k_cache)
	    # Rows of the cache the op is expected to fill.
	    sl = slice(cache_offset, cache_offset + seq_len)

	    def flashrt_fn():
	        return ops.qkv_split_rope_kvcache_bf16(
	            packed, rope, q_heads, kv_heads, head_dim, cache_offset, q_out, k_cache, v_cache
	        )

	    def eager_fn():
	        ref_q, ref_k, ref_v = torch_ref_kvcache(packed, rope, q_heads, kv_heads, head_dim)
	        q_out.copy_(ref_q)
	        k_cache[:, sl].copy_(ref_k)
	        v_cache[:, sl].copy_(ref_v)
	        return q_out, k_cache, v_cache

	    got_q, got_k_cache, got_v_cache = flashrt_fn()
	    exp_q, exp_k, exp_v = torch_ref_kvcache(packed, rope, q_heads, kv_heads, head_dim)
	    q_p99, q_cos = metrics(got_q, exp_q)
	    k_p99, k_cos = metrics(got_k_cache[:, sl], exp_k)
	    v_p99, v_cos = metrics(got_v_cache[:, sl], exp_v)

	    flashrt_us = time_us(flashrt_fn, args.warmup, args.iters)
	    eager_us = time_us(eager_fn, args.warmup, args.iters)

	    limit = args.p99_abs_limit
	    passed = q_p99 <= limit and k_p99 <= limit and v_p99 == 0.0
	    return Result(
	        shape=name,
	        batch=batch,
	        seq_len=seq_len,
	        heads=q_heads,
	        head_dim=head_dim,
	        flashrt_us=flashrt_us,
	        torch_eager_us=eager_us,
	        speedup_vs_eager=eager_us / flashrt_us,
	        q_p99_abs=max(q_p99, v_p99),
	        k_p99_abs=k_p99,
	        q_cosine=min(q_cos, v_cos),
	        k_cosine=k_cos,
	        status="PASS" if passed else "FAIL",
	    )


	def write_markdown(path: Path, results: list[Result]) -> None:
	    """Write ``results`` to ``path`` as a Markdown table, one row per result.

	    The file always contains the header and alignment rows, followed by one
	    row per entry of ``results``, and ends with a newline. Cells are
	    separated by plain ``|`` characters so the output parses as a table.

	    Args:
	        path: destination file; it is overwritten.
	        results: rows to render, in order.
	    """
	    header = "| Shape | B,L,H,D | FlashRT us | Eager us | vs eager | Q p99 | K p99 | Q cosine | K cosine | Status |"
	    alignment = "|---|---:|---:|---:|---:|---:|---:|---:|---:|---|"
	    rows = [
	        f"| {r.shape} | {r.batch},{r.seq_len},{r.heads},{r.head_dim} | "
	        f"{r.flashrt_us:.3f} | {r.torch_eager_us:.3f} | {r.speedup_vs_eager:.2f}x | "
	        f"{r.q_p99_abs:.6f} | {r.k_p99_abs:.6f} | {r.q_cosine:.8f} | "
	        f"{r.k_cosine:.8f} | {r.status} |"
	        for r in results
	    ]
	    path.write_text("\n".join([header, alignment, *rows]) + "\n")


	def main() -> None:
	    """Parse the command line, run the selected benchmark cases, and report.

	    Prints one summary line per case, optionally writes JSON (``--output``)
	    and Markdown (``--markdown``) reports, and exits with status 1 if any
	    case did not pass. Exits early with a message when CUDA is unavailable.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--backend", choices=["source", "installed"], default="source")
	    parser.add_argument("--artifact", default=None)
	    parser.add_argument("--shapes", choices=sorted(SHAPE_GROUPS), default="smoke")
	    parser.add_argument("--warmup", type=int, default=5)
	    parser.add_argument("--iters", type=int, default=20)
	    parser.add_argument("--eps", type=float, default=1e-6)
	    parser.add_argument("--p99-abs-limit", type=float, default=0.015625)
	    parser.add_argument("--output", default=None)
	    parser.add_argument("--markdown", default=None)
	    args = parser.parse_args()

	    if not torch.cuda.is_available():
	        raise SystemExit("CUDA is required")
	    torch.manual_seed(37)
	    if args.backend == "source":
	        ops = load_source_ops()
	    else:
	        ops = load_installed_ops(args.artifact)

	    group = args.shapes
	    # The cases below consume the seeded RNG, so their order is part of the results.
	    results = [run_one(ops, shape_name, SHAPES[shape_name], args) for shape_name in SHAPE_GROUPS[group]]
	    if group in ("smoke", "all"):
	        results.append(run_joint3(ops, "joint3_small", 64, 8, 4, 8, 128, args))
	        results.append(run_kvcache_gqa(ops, "pi05_decoder_gqa_kvcache", 1, 10, 8, 1, 256, args))
	    if group in ("headline", "all"):
	        results.append(run_joint3(ops, "joint3_vla", 2520, 16, 16, 24, 128, args))
	        results.append(run_decode_q(ops, "decode_q_stage_h24", 24, args))
	        results.append(run_decode_kv(ops, "decode_kvwrite_h8", 8, False, args))
	        results.append(run_decode_kv(ops, "decode_kvwrite_devpos_h8", 8, True, args))
	        # "all" already ran the KV-cache case in the branch above.
	        if group == "headline":
	            results.append(run_kvcache_gqa(ops, "pi05_decoder_gqa_kvcache", 1, 10, 8, 1, 256, args))

	    for r in results:
	        print(
	            f"{r.status} {r.shape}: flashrt={r.flashrt_us:.3f}us "
	            f"eager={r.torch_eager_us:.3f}us speedup={r.speedup_vs_eager:.2f}x "
	            f"q_p99={r.q_p99_abs:.6f} k_p99={r.k_p99_abs:.6f}"
	        )

	    if args.output:
	        json_path = Path(args.output)
	        json_path.parent.mkdir(parents=True, exist_ok=True)
	        payload = [asdict(r) for r in results]
	        json_path.write_text(json.dumps(payload, indent=2) + "\n")
	    if args.markdown:
	        markdown_path = Path(args.markdown)
	        markdown_path.parent.mkdir(parents=True, exist_ok=True)
	        write_markdown(markdown_path, results)

	    if not all(r.status == "PASS" for r in results):
	        raise SystemExit(1)


	if __name__ == "__main__":
	    main()