Uploaded using `kernel-builder`.

34562e4 verified 5 days ago

18.7 kB

	#!/usr/bin/env python3
	"""Benchmark flashrt-fp8-ffn against PyTorch eager/compile references."""

	from __future__ import annotations

	import argparse
	import ctypes
	import ctypes.util
	import importlib
	import json
	import math
	import os
	import sys
	from dataclasses import asdict, dataclass
	from pathlib import Path

	import torch


	# Directory two levels above the folder that contains this script.
	ROOT = Path(__file__).resolve().parents[2]
	# Kernel package directory; the source backend compiles files from here.
	PACKAGE = ROOT.joinpath("flashrt-fp8-ffn")
	# kernel-builder template directory, looked up in a checkout next to ROOT and
	# passed to the JIT build as an extra include path.
	REGISTRATION_INCLUDE = ROOT.parent.joinpath(
	    "kernels",
	    "kernel-builder",
	    "src",
	    "pyproject",
	    "templates",
	    "torch",
	)


	# Benchmark shapes keyed by name. Each value is (M, K, H, N, layers):
	# run_shape unpacks the tuple in that order, make_inputs builds (M, K)
	# activations, (H, K) up-projection weights and (N, H) down-projection
	# weights, and `layers` independent FFN layers are executed per timed call.
	SHAPES: dict[str, tuple[int, int, int, int, int]] = {
	    # PI0.5 decoder chunks. Production default is 10 denoising steps.
	    "pi05_decoder_ffn_m1": (1, 1024, 4096, 1024, 18),
	    "pi05_decoder_ffn_m8": (8, 1024, 4096, 1024, 18),
	    "pi05_decoder_ffn_m10": (10, 1024, 4096, 1024, 18),
	    "pi05_decoder_ffn_m16": (16, 1024, 4096, 1024, 18),
	    # Backward-compatible headline alias.
	    "pi05_decoder_ffn": (10, 1024, 4096, 1024, 18),
	    # PI0.5 SigLIP-L FFN. One view is 256 visual tokens.
	    "pi05_vision_ffn_1view": (256, 1152, 4304, 1152, 27),
	    "pi05_vision_ffn_2view": (512, 1152, 4304, 1152, 27),
	    "pi05_vision_ffn_3view": (768, 1152, 4304, 1152, 27),
	    # GROOT/Qwen3-VL ViT FFN.
	    "groot_vit_ffn_1view": (256, 1024, 4096, 1024, 24),
	    "groot_vit_ffn_2view": (512, 1024, 4096, 1024, 24),
	    "groot_vit_ffn_4view": (1024, 1024, 4096, 1024, 24),
	    # GROOT DeepStack merger. Two-view ViT taps produce 128 merged tokens.
	    "groot_deepstack_merge_2view": (128, 4096, 4096, 2048, 3),
	    # GROOT VL self-attention FFN. Sequence length changes with vision/text mix.
	    "groot_vl_self_attn_ffn_seq512": (512, 2048, 8192, 2048, 4),
	    "groot_vl_self_attn_ffn_seq1024": (1024, 2048, 8192, 2048, 4),
	    "groot_vl_self_attn_ffn_seq2520": (2520, 2048, 8192, 2048, 4),
	    # Backward-compatible headline alias.
	    "groot_vl_self_attn_ffn": (1024, 2048, 8192, 2048, 4),
	    # GROOT action DiT GELU FFN. This is exact GELU shape, but the production
	    # path currently uses BF16 GEMMs; report it as a shape fit, not a deployed
	    # FP8 action-head claim until model wiring is done.
	    "groot_action_dit_ffn": (41, 1536, 6144, 1536, 32),
	}

	# Named selections of SHAPES keys; --shapes accepts these group names as well
	# as individual shape names.
	SHAPE_GROUPS = {
	    "headline": [
	        "pi05_decoder_ffn_m10",
	        "pi05_vision_ffn_2view",
	        "groot_vit_ffn_2view",
	        "groot_vl_self_attn_ffn_seq1024",
	    ],
	    "pi05": [
	        "pi05_decoder_ffn_m1",
	        "pi05_decoder_ffn_m8",
	        "pi05_decoder_ffn_m10",
	        "pi05_decoder_ffn_m16",
	        "pi05_vision_ffn_1view",
	        "pi05_vision_ffn_2view",
	        "pi05_vision_ffn_3view",
	    ],
	    "groot": [
	        "groot_vit_ffn_1view",
	        "groot_vit_ffn_2view",
	        "groot_vit_ffn_4view",
	        "groot_deepstack_merge_2view",
	        "groot_vl_self_attn_ffn_seq512",
	        "groot_vl_self_attn_ffn_seq1024",
	        "groot_vl_self_attn_ffn_seq2520",
	        "groot_action_dit_ffn",
	    ],
	}
	# "all" is every PI0.5 shape followed by every GROOT shape, as a fresh list.
	SHAPE_GROUPS["all"] = [*SHAPE_GROUPS["pi05"], *SHAPE_GROUPS["groot"]]


	@dataclass
	class Result:
	    """One benchmark row: shape, timings, speedups and accuracy for a single shape."""

	    # Name of the shape (a key of SHAPES) and its unpacked dimensions.
	    shape: str
	    M: int
	    K: int
	    H: int
	    N: int
	    layers: int
	    # Average time of one full layer stack, in microseconds.
	    flashrt_us: float
	    torch_eager_us: float
	    # None when the compile baseline was not requested or could not be measured.
	    torch_compile_us: float | None
	    # Reference time divided by the FlashRT time.
	    speedup_vs_eager: float
	    speedup_vs_compile: float | None
	    # "ok", "not_requested", or an "unsupported: ..." explanation.
	    compile_status: str
	    # Absolute error of layer 0 against the eager reference.
	    max_abs: float
	    p99_abs: float
	    # Relative error whose denominator is clamped to be at least 1.0.
	    p99_rel_floor1: float
	    max_rel_floor1: float
	    # "PASS" or "FAIL" from the precision gate.
	    status: str


	class SourceOps:
	    """Thin wrapper over ``torch.ops.<namespace>`` that fills in missing buffers.

	    The wrapped op takes its ``hidden``, ``hidden_fp8`` and ``out`` buffers as
	    explicit arguments; this adapter allocates any of them the caller omits
	    and returns ``out``.
	    """

	    def __init__(self, namespace: str) -> None:
	        """Bind to the op namespace registered under ``torch.ops``."""
	        self._ops = getattr(torch.ops, namespace)

	    def fp8_gelu_mlp_bf16(
	        self,
	        x,
	        up_w,
	        up_b,
	        down_w,
	        down_b,
	        x_scale,
	        up_w_scale,
	        hidden_scale,
	        down_w_scale,
	        hidden=None,
	        hidden_fp8=None,
	        out=None,
	    ):
	        """Call the registered ``fp8_gelu_mlp_bf16`` op and return ``out``.

	        Omitted buffers are created on ``x.device``: ``hidden`` as bf16
	        ``(x.shape[0], up_w.shape[0])``, ``hidden_fp8`` with the shape of
	        ``hidden`` in the FP8 dtype, and ``out`` as bf16
	        ``(x.shape[0], down_w.shape[0])``.
	        """

	        def _bf16_buffer(columns):
	            # Only evaluated when a buffer is actually missing.
	            return torch.empty(
	                (x.shape[0], columns), device=x.device, dtype=torch.bfloat16
	            )

	        hidden = _bf16_buffer(up_w.shape[0]) if hidden is None else hidden
	        if hidden_fp8 is None:
	            hidden_fp8 = torch.empty_like(hidden, dtype=fp8_dtype())
	        out = _bf16_buffer(down_w.shape[0]) if out is None else out

	        op_args = (
	            x,
	            up_w,
	            up_b,
	            down_w,
	            down_b,
	            x_scale,
	            up_w_scale,
	            hidden_scale,
	            down_w_scale,
	            hidden,
	            hidden_fp8,
	            out,
	        )
	        self._ops.fp8_gelu_mlp_bf16(*op_args)
	        return out


	def _preload_cublaslt() -> None:
	for parent in Path(torch.__file__).resolve().parents:
	candidate = parent / "nvidia" / "cublas" / "lib" / "libcublasLt.so.12"
	if candidate.exists():
	ctypes.CDLL(str(candidate), mode=ctypes.RTLD_GLOBAL)
	return
	library = ctypes.util.find_library("cublasLt")
	if library:
	ctypes.CDLL(library, mode=ctypes.RTLD_GLOBAL)


	def _current_arch_list() -> str:
	    """Return the compute capability of CUDA device 0 as ``"<major>.<minor>"``."""
	    cc_major, cc_minor = torch.cuda.get_device_capability(0)
	    return f"{cc_major}.{cc_minor}"


	def load_source_ops() -> SourceOps:
	    """JIT-compile the kernel sources under PACKAGE and wrap the resulting ops.

	    Raises:
	        RuntimeError: if the kernel-builder registration include directory
	            (REGISTRATION_INCLUDE) does not exist.
	    """
	    from torch.utils.cpp_extension import load as jit_load

	    if not REGISTRATION_INCLUDE.is_dir():
	        raise RuntimeError(f"missing kernel-builder registration include: {REGISTRATION_INCLUDE}")
	    _preload_cublaslt()
	    # Build only for the local GPU unless the user already chose an arch list.
	    os.environ.setdefault("TORCH_CUDA_ARCH_LIST", _current_arch_list())

	    namespace = "flashrt_fp8_ffn_benchmark"
	    source_files = [
	        str(PACKAGE / "torch-ext" / "torch_binding.cpp"),
	        str(PACKAGE / "csrc" / "fp8_ffn.cu"),
	    ]
	    include_dirs = [str(PACKAGE / "csrc"), str(REGISTRATION_INCLUDE)]
	    host_flags = ["-O3", "-DCUDA_KERNEL"]
	    device_flags = ["-O3", "--expt-relaxed-constexpr", "-DCUDA_KERNEL"]
	    jit_load(
	        name=namespace,
	        sources=source_files,
	        extra_include_paths=include_dirs,
	        extra_cflags=host_flags,
	        extra_cuda_cflags=device_flags,
	        verbose=False,
	    )
	    return SourceOps(namespace)


	def load_installed_ops(artifact: str | None):
	    """Import and return the ``flashrt_fp8_ffn`` module.

	    Args:
	        artifact: Optional directory that is placed at the front of
	            ``sys.path`` for the duration of the import only. When falsy, the
	            module is imported from the current environment.

	    Returns:
	        The imported ``flashrt_fp8_ffn`` module.

	    Raises:
	        ImportError: if the module cannot be imported.
	    """
	    if not artifact:
	        return importlib.import_module("flashrt_fp8_ffn")

	    sys.path.insert(0, artifact)
	    try:
	        return importlib.import_module("flashrt_fp8_ffn")
	    finally:
	        # Guarded so that a path entry removed during the import cannot raise
	        # ValueError here and hide the real import outcome.
	        if artifact in sys.path:
	            sys.path.remove(artifact)


	def load_hub_ops(repo_id: str, version: int):
	    """Fetch the kernel ``repo_id`` at ``version`` through ``kernels.get_kernel``."""
	    # Imported lazily so the `kernels` package is only needed for this backend.
	    from kernels import get_kernel as fetch_kernel

	    hub_kernel = fetch_kernel(repo_id, version=version)
	    return hub_kernel


	def fp8_dtype() -> torch.dtype:
	if torch.version.hip is not None and hasattr(torch, "float8_e4m3fnuz"):
	return torch.float8_e4m3fnuz
	return torch.float8_e4m3fn


	def fp8_max() -> float:
	return 240.0 if torch.version.hip is not None else 448.0


	def quantize_fp8(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
	    """Divide ``x`` by ``scale`` in float32, saturate to the FP8 range, cast to FP8."""
	    bound = fp8_max()
	    scaled = x.float() / scale.float()
	    # Clamp before the cast so out-of-range values saturate at +/-bound.
	    return scaled.clamp(-bound, bound).to(fp8_dtype())


	def dequant_fp8(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
	    """Return ``x`` multiplied by ``scale``, with both operands cast to float32."""
	    values = x.float()
	    factor = scale.float()
	    return values * factor


	def compiler_disable(fn):
	compiler = getattr(torch, "compiler", None)
	if compiler is not None and hasattr(compiler, "disable"):
	return compiler.disable(fn)
	return torch._dynamo.disable(fn)


	def gelu_quantize_fp8_boundary(
	    hidden: torch.Tensor, bias: torch.Tensor, scale: torch.Tensor
	) -> torch.Tensor:
	    """Add ``bias``, apply tanh-approximated GELU in float32, and quantize to FP8."""
	    pre_activation = hidden.float() + bias.float()
	    activated = torch.nn.functional.gelu(pre_activation, approximate="tanh")
	    return quantize_fp8(activated, scale)


	def bf16_bias_add_boundary(out: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
	    """Add ``bias`` to ``out`` in float32 and return the sum as bfloat16."""
	    summed = torch.add(out.float(), bias.float())
	    return summed.to(torch.bfloat16)


	# Compiler-opaque versions of the two boundary helpers; the compile baseline
	# calls these so torch.compile does not trace through them.
	stable_gelu_quantize_fp8, stable_bf16_bias_add = (
	    compiler_disable(boundary)
	    for boundary in (gelu_quantize_fp8_boundary, bf16_bias_add_boundary)
	)


	def torch_mlp(x, up_w, up_b, down_w, down_b, x_s, up_s, hid_s, dn_s):
	    """Eager PyTorch reference for one FP8 GELU FFN layer; returns bfloat16.

	    Steps: dequantize ``x`` and ``up_w`` and multiply, round to bf16, add
	    ``up_b`` and apply tanh-GELU, re-quantize to FP8 with ``hid_s``,
	    dequantize and multiply by ``down_w``, round to bf16, add ``down_b``.
	    """
	    bf16 = torch.bfloat16
	    hidden = torch.matmul(dequant_fp8(x, x_s), dequant_fp8(up_w, up_s).T).to(bf16)
	    hidden = torch.nn.functional.gelu(hidden + up_b.float(), approximate="tanh")
	    bound = fp8_max()
	    # Saturating FP8 quantization of the activation between the two GEMMs.
	    hidden_fp8 = torch.clamp(hidden / hid_s.float(), -bound, bound).to(fp8_dtype())
	    out = torch.matmul(dequant_fp8(hidden_fp8, hid_s), dequant_fp8(down_w, dn_s).T).to(bf16)
	    return (out + down_b.float()).to(bf16)


	def torch_mlp_compile_stable(x, up_w, up_b, down_w, down_b, x_s, up_s, hid_s, dn_s):
	    """Reference FFN layer whose GELU/quantize and bias-add steps are compiler-opaque.

	    Both matmuls stay inline; the activation/quantization step and the final
	    bias add go through the ``stable_*`` helpers.
	    """
	    bf16 = torch.bfloat16
	    hidden = torch.matmul(dequant_fp8(x, x_s), dequant_fp8(up_w, up_s).T).to(bf16)
	    hidden_fp8 = stable_gelu_quantize_fp8(hidden, up_b, hid_s)
	    out = torch.matmul(dequant_fp8(hidden_fp8, hid_s), dequant_fp8(down_w, dn_s).T).to(bf16)
	    return stable_bf16_bias_add(out, down_b)


	def make_inputs(M: int, K: int, H: int, N: int, layers: int):
	    """Create per-layer CUDA inputs, weights, biases, scales and buffers.

	    Returns a 12-tuple ``(xs, up_ws, up_bs, down_ws, down_bs, x_scale,
	    up_scale, hidden_scale, down_scale, hidden, hidden_fp8, outs)``. The list
	    entries hold one tensor per layer; the four scales are shared one-element
	    float32 tensors.
	    """

	    def _scale(value):
	        return torch.tensor([value], device="cuda", dtype=torch.float32)

	    def _bf16_randn(*dims):
	        return torch.randn(dims, device="cuda", dtype=torch.bfloat16)

	    def _bf16_empty(*dims):
	        return torch.empty(dims, device="cuda", dtype=torch.bfloat16)

	    layer_ids = range(layers)
	    x_scale = _scale(0.05)
	    up_scale = _scale(0.04)
	    hidden_scale = _scale(0.25)
	    down_scale = _scale(0.04)

	    # Random draws happen in this exact order (activations, up weights, down
	    # weights, up biases, down biases) so a fixed seed gives the same data.
	    xs = [quantize_fp8(_bf16_randn(M, K), x_scale) for _ in layer_ids]
	    up_ws = [quantize_fp8(_bf16_randn(H, K), up_scale) for _ in layer_ids]
	    down_ws = [quantize_fp8(_bf16_randn(N, H), down_scale) for _ in layer_ids]
	    up_bs = [_bf16_randn(H) for _ in layer_ids]
	    down_bs = [_bf16_randn(N) for _ in layer_ids]

	    # Buffers handed to the kernel, one set per layer.
	    hidden = [_bf16_empty(M, H) for _ in layer_ids]
	    hidden_fp8 = [
	        torch.empty((M, H), device="cuda", dtype=fp8_dtype()) for _ in layer_ids
	    ]
	    outs = [_bf16_empty(M, N) for _ in layer_ids]
	    return (
	        xs,
	        up_ws,
	        up_bs,
	        down_ws,
	        down_bs,
	        x_scale,
	        up_scale,
	        hidden_scale,
	        down_scale,
	        hidden,
	        hidden_fp8,
	        outs,
	    )


	def time_us(fn, *, warmup: int, iters: int) -> float:
	    """Return the average time of one ``fn()`` call in microseconds.

	    ``fn`` is called ``warmup`` times untimed, then ``iters`` times between
	    two CUDA events recorded on the current stream.

	    Args:
	        fn: Zero-argument callable that launches the work to time.
	        warmup: Number of untimed calls made first.
	        iters: Number of timed calls; must be at least 1.

	    Raises:
	        ValueError: if ``iters`` is not positive (the average would be
	            undefined).
	    """
	    if iters <= 0:
	        raise ValueError(f"iters must be a positive integer, got {iters}")
	    for _ in range(warmup):
	        fn()
	    # Drain warmup work so it is not attributed to the timed region.
	    torch.cuda.synchronize()
	    start = torch.cuda.Event(enable_timing=True)
	    end = torch.cuda.Event(enable_timing=True)
	    start.record()
	    for _ in range(iters):
	        fn()
	    end.record()
	    torch.cuda.synchronize()
	    # elapsed_time reports milliseconds; convert to microseconds per call.
	    return start.elapsed_time(end) * 1000.0 / iters


	def percentile(x: torch.Tensor, q: float) -> torch.Tensor:
	    """Return the ``ceil(q * n)``-th smallest element of ``x`` as a 0-d tensor.

	    The rank is clamped to ``[1, n]``, where ``n`` is the number of elements,
	    so ``q = 0`` gives the minimum and ``q = 1`` the maximum.
	    """
	    values = x.flatten()
	    count = values.numel()
	    rank = math.ceil(q * count)
	    rank = max(1, min(count, rank))
	    return torch.kthvalue(values, rank).values


	def _outputs_close(got, expected) -> bool:
	if isinstance(got, (tuple, list)) and isinstance(expected, (tuple, list)):
	return len(got) == len(expected) and all(
	_outputs_close(g, e) for g, e in zip(got, expected)
	)
	return bool(torch.allclose(got, expected, rtol=3e-2, atol=1.25e-1))


	def compile_time_us(fn, expected, *, warmup: int, iters: int) -> tuple[float | None, str]:
	    """Time ``torch.compile(fn)`` if its output matches ``expected``.

	    Args:
	        fn: Zero-argument callable to compile with ``mode="reduce-overhead"``.
	        expected: Reference output the compiled result is checked against.
	        warmup: Untimed calls before measurement.
	        iters: Timed calls.

	    Returns:
	        ``(microseconds, "ok")`` on success, otherwise ``(None, reason)``
	        where ``reason`` starts with ``"unsupported: "``. Any exception from
	        compiling, running or timing is reported this way, not raised.
	    """
	    try:
	        compiled = torch.compile(fn, fullgraph=False, mode="reduce-overhead")
	        # First call triggers compilation and yields the output to validate.
	        compiled_out = compiled()
	        torch.cuda.synchronize()
	        if not _outputs_close(compiled_out, expected):
	            return None, "unsupported: compiled reference output mismatch"
	        return time_us(compiled, warmup=warmup, iters=iters), "ok"
	    except Exception as exc:  # noqa: BLE001
	        # Deliberately broad: the compile baseline is optional and must not
	        # abort the benchmark.
	        return None, f"unsupported: {type(exc).__name__}: {exc}"


	def run_shape(ops, name: str, shape, args) -> Result:
	    """Benchmark one shape and check FlashRT accuracy against the eager reference.

	    Args:
	        ops: Backend object exposing ``fp8_gelu_mlp_bf16``.
	        name: Shape name stored in the result.
	        shape: ``(M, K, H, N, layers)`` tuple.
	        args: Parsed CLI arguments; reads ``warmup``, ``iters``,
	            ``compile_baseline``, ``p99_abs_limit`` and
	            ``p99_rel_floor1_limit``.

	    Returns:
	        A ``Result`` with timings, speedups, error statistics for layer 0 and
	        the PASS/FAIL precision status.
	    """
	    M, K, H, N, layers = shape
	    (
	        xs,
	        up_ws,
	        up_bs,
	        down_ws,
	        down_bs,
	        x_s,
	        up_s,
	        hid_s,
	        dn_s,
	        hidden,
	        hidden_fp8,
	        outs,
	    ) = make_inputs(M, K, H, N, layers)

	    def flashrt_stack():
	        # Kernel under test, one call per layer, writing into preallocated buffers.
	        return tuple(
	            ops.fp8_gelu_mlp_bf16(
	                xs[layer],
	                up_ws[layer],
	                up_bs[layer],
	                down_ws[layer],
	                down_bs[layer],
	                x_s,
	                up_s,
	                hid_s,
	                dn_s,
	                hidden[layer],
	                hidden_fp8[layer],
	                outs[layer],
	            )
	            for layer in range(layers)
	        )

	    def torch_stack():
	        # Eager PyTorch baseline over the same per-layer inputs.
	        return tuple(
	            torch_mlp(
	                xs[layer], up_ws[layer], up_bs[layer], down_ws[layer], down_bs[layer],
	                x_s, up_s, hid_s, dn_s,
	            )
	            for layer in range(layers)
	        )

	    def torch_stack_compile_stable():
	        # Variant handed to torch.compile for the optional compile baseline.
	        return tuple(
	            torch_mlp_compile_stable(
	                xs[layer], up_ws[layer], up_bs[layer], down_ws[layer], down_bs[layer],
	                x_s, up_s, hid_s, dn_s,
	            )
	            for layer in range(layers)
	        )

	    # Accuracy: run the kernel once and compare layer 0 with the eager reference.
	    flashrt_stack()
	    reference = torch_mlp(
	        xs[0], up_ws[0], up_bs[0], down_ws[0], down_bs[0], x_s, up_s, hid_s, dn_s
	    ).float()
	    abs_err = (outs[0].float() - reference).abs().flatten()
	    # Denominator floored at 1.0 so near-zero references do not blow up the ratio.
	    rel_err = abs_err / reference.abs().flatten().clamp_min(1.0)
	    max_abs = float(abs_err.max().item())
	    p99_abs = float(percentile(abs_err, 0.99).item())
	    p99_rel = float(percentile(rel_err, 0.99).item())
	    max_rel = float(rel_err.max().item())
	    within_gate = p99_abs <= args.p99_abs_limit and p99_rel <= args.p99_rel_floor1_limit
	    status = "PASS" if within_gate else "FAIL"

	    flashrt_us = time_us(flashrt_stack, warmup=args.warmup, iters=args.iters)
	    eager_us = time_us(torch_stack, warmup=args.warmup, iters=args.iters)

	    compile_us, compile_status = None, "not_requested"
	    if args.compile_baseline:
	        eager_expected = torch_stack()
	        stable_expected = torch_stack_compile_stable()
	        torch.cuda.synchronize()
	        # Only time the compiled path if its uncompiled form agrees with eager.
	        if _outputs_close(stable_expected, eager_expected):
	            compile_us, compile_status = compile_time_us(
	                torch_stack_compile_stable,
	                eager_expected,
	                warmup=args.warmup,
	                iters=args.iters,
	            )
	        else:
	            compile_status = "unsupported: stable compile reference differs from eager"

	    speedup_vs_compile = None if compile_us is None else compile_us / flashrt_us
	    return Result(
	        shape=name,
	        M=M,
	        K=K,
	        H=H,
	        N=N,
	        layers=layers,
	        flashrt_us=flashrt_us,
	        torch_eager_us=eager_us,
	        torch_compile_us=compile_us,
	        speedup_vs_eager=eager_us / flashrt_us,
	        speedup_vs_compile=speedup_vs_compile,
	        compile_status=compile_status,
	        max_abs=max_abs,
	        p99_abs=p99_abs,
	        p99_rel_floor1=p99_rel,
	        max_rel_floor1=max_rel,
	        status=status,
	    )


	def _markdown_cell(value) -> str:
	    """Return ``value`` as text that is safe inside one Markdown table cell."""
	    # A line break would end the table row, so collapse all whitespace runs.
	    single_line = " ".join(str(value).split())
	    # A bare pipe would start a new column, so escape it.
	    return single_line.replace("|", "\\|")


	def write_markdown(path: Path, results: list[Result], args) -> None:
	    """Write the benchmark results to ``path`` as a Markdown report.

	    The report has a bullet-list header (backend, device, torch version,
	    warmup/iters, precision gate) followed by one table row per result.

	    Args:
	        path: Output file; overwritten, encoded as UTF-8.
	        results: Rows to render, in order.
	        args: Parsed CLI arguments; reads ``backend``, ``warmup``, ``iters``,
	            ``p99_abs_limit`` and ``p99_rel_floor1_limit``.
	    """
	    lines = [
	        "# Benchmark Results: flashrt-fp8-ffn",
	        "",
	        f"- Backend: `{args.backend}`",
	        f"- Device: `{torch.cuda.get_device_name(0)}`",
	        f"- Torch: `{torch.__version__}`",
	        f"- Warmup/iters: `{args.warmup}/{args.iters}`",
	        f"- Precision gate: p99_abs <= `{args.p99_abs_limit}` and "
	        f"p99_rel_floor1 <= `{args.p99_rel_floor1_limit}`",
	        "- Compile baseline: reported only when compiled reference output "
	        "matches eager reference output.",
	        "",
	        "| Shape | M,K,H,N | Layers | FlashRT us | Eager us | vs eager | Compile us | vs compile | Compile status | P99 abs | P99 rel | Max abs | Status |",
	        "|---|---:|---:|---:|---:|---:|---:|---:|---|---:|---:|---:|---:|",
	    ]
	    for r in results:
	        compile_us = f"{r.torch_compile_us:.3f}" if r.torch_compile_us is not None else "n/a"
	        compile_speedup = f"{r.speedup_vs_compile:.2f}x" if r.speedup_vs_compile is not None else "n/a"
	        # compile_status may carry a multi-line exception message; keep it
	        # inside a single cell.
	        compile_status = _markdown_cell(r.compile_status)
	        lines.append(
	            f"| {r.shape} | {r.M},{r.K},{r.H},{r.N} | {r.layers} | "
	            f"{r.flashrt_us:.3f} | {r.torch_eager_us:.3f} | {r.speedup_vs_eager:.2f}x | "
	            f"{compile_us} | {compile_speedup} | {compile_status} | {r.p99_abs:.4f} | "
	            f"{r.p99_rel_floor1:.6f} | {r.max_abs:.4f} | {r.status} |"
	        )
	    lines.append("")
	    path.write_text("\n".join(lines), encoding="utf-8")


	def _resolve_shape_names(spec: str) -> list[str]:
	    """Expand a comma-separated ``--shapes`` value into concrete shape names.

	    Group names are replaced by their members, other entries are taken as
	    shape names, and empty entries (for example from a trailing comma) are
	    ignored.

	    Raises:
	        SystemExit: if any name is unknown or nothing was selected.
	    """
	    names: list[str] = []
	    for item in (part.strip() for part in spec.split(",")):
	        if not item:
	            continue
	        if item in SHAPE_GROUPS:
	            names.extend(SHAPE_GROUPS[item])
	        else:
	            names.append(item)
	    unknown = [name for name in names if name not in SHAPES]
	    if unknown:
	        raise SystemExit(f"unknown shapes/groups: {unknown}")
	    if not names:
	        raise SystemExit("no shapes selected")
	    return names


	def main() -> None:
	    """Parse CLI arguments, run the selected benchmarks and report the results.

	    Results are always printed; ``--output`` additionally writes JSON and
	    ``--markdown`` a Markdown report. ``--list-shapes`` prints the known
	    groups and shapes and exits without needing CUDA.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--backend", choices=["source", "installed", "hub"], default="source")
	    parser.add_argument("--artifact", default=None)
	    parser.add_argument("--repo-id", default="flashrt/flashrt-fp8-ffn")
	    parser.add_argument("--version", type=int, default=1)
	    parser.add_argument("--shapes", default="all")
	    parser.add_argument("--warmup", type=int, default=5)
	    parser.add_argument("--iters", type=int, default=20)
	    parser.add_argument("--compile-baseline", action="store_true")
	    parser.add_argument("--p99-abs-limit", type=float, default=1.0)
	    parser.add_argument("--p99-rel-floor1-limit", type=float, default=0.05)
	    parser.add_argument("--output", type=Path, default=None)
	    parser.add_argument("--markdown", type=Path, default=None)
	    parser.add_argument("--list-shapes", action="store_true")
	    args = parser.parse_args()

	    if args.list_shapes:
	        print("Shape groups:")
	        for group, members in SHAPE_GROUPS.items():
	            print(f" {group}: {','.join(members)}")
	        print("\nShapes:")
	        for name, shape in SHAPES.items():
	            print(f" {name}: M,K,H,N,layers={shape}")
	        return

	    # Reject bad input before the expensive steps (CUDA init, JIT build, hub
	    # download) so typos fail immediately.
	    if args.iters < 1:
	        parser.error("--iters must be at least 1")
	    names = _resolve_shape_names(args.shapes)

	    if not torch.cuda.is_available():
	        raise SystemExit("CUDA is required")
	    # Fixed seed so every run benchmarks the same random inputs.
	    torch.manual_seed(17)
	    if args.backend == "source":
	        ops = load_source_ops()
	    elif args.backend == "installed":
	        ops = load_installed_ops(args.artifact)
	    else:
	        ops = load_hub_ops(args.repo_id, args.version)

	    results = []
	    for name in names:
	        results.append(run_shape(ops, name, SHAPES[name], args))
	        # Release cached blocks before the next shape is allocated.
	        torch.cuda.empty_cache()

	    for r in results:
	        compile_part = (
	            f", compile={r.torch_compile_us:.3f}us, vs_compile={r.speedup_vs_compile:.2f}x"
	            if r.torch_compile_us is not None
	            else f", compile={r.compile_status}"
	        )
	        print(
	            f"{r.shape}: flashrt={r.flashrt_us:.3f}us, eager={r.torch_eager_us:.3f}us, "
	            f"vs_eager={r.speedup_vs_eager:.2f}x{compile_part}, "
	            f"p99_abs={r.p99_abs:.4f}, max_abs={r.max_abs:.4f}, {r.status}"
	        )

	    payload = {
	        "backend": args.backend,
	        "torch": torch.__version__,
	        "device": torch.cuda.get_device_name(0),
	        "results": [asdict(r) for r in results],
	    }
	    if args.output:
	        args.output.parent.mkdir(parents=True, exist_ok=True)
	        args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
	    if args.markdown:
	        args.markdown.parent.mkdir(parents=True, exist_ok=True)
	        write_markdown(args.markdown, results, args)


	# Run the benchmark CLI only when this file is executed as a script, not when
	# it is imported.
	if __name__ == "__main__":
	    main()