Re-initial source (hadamard kebab name, BSD-3-Clause)

e4a8c54 verified 7 days ago

1.06 kB

	"""Throughput benchmark vs torch.clone (memcpy lower bound)."""

	import math
	import time

	import torch

	from hadamard import hadamard_transform


	def _bench(fn, x, iters=200, warmup=20):
	for _ in range(warmup):
	fn(x)
	torch.cuda.synchronize()
	t0 = time.perf_counter()
	for _ in range(iters):
	fn(x)
	torch.cuda.synchronize()
	return (time.perf_counter() - t0) / iters * 1e6 # microseconds


	def main():
		"""Print a timing table of ``hadamard_transform`` against ``torch.clone``.

		For every dtype/width pair, a random ``(32 * 1024, dim)`` tensor is
		created on the ``"cuda"`` device and timed through
		``hadamard_transform`` (called with ``scale=1/sqrt(dim)``) and through
		``torch.clone``. One line per pair reports both per-call times in
		microseconds and the hadamard/clone ratio.
		"""
		n_rows = 32 * 1024
		dtypes = (torch.float16, torch.bfloat16, torch.float32)
		dims = (512, 4096, 16384, 32768)

		for dtype in dtypes:
			# "torch.float16" -> "float16"; depends only on the dtype.
			dtype_name = str(dtype).split('.')[-1]
			for dim in dims:
				x = torch.randn(n_rows, dim, dtype=dtype, device="cuda")
				scale = 1.0 / math.sqrt(dim)

				def run_hadamard(t):
					return hadamard_transform(t, scale=scale)

				t_had = _bench(run_hadamard, x)
				t_cpy = _bench(torch.clone, x)
				ratio = t_had / t_cpy
				print(
					f"dtype={dtype_name:>9} dim={dim:>5} "
					f"hadamard={t_had:7.1f}us clone={t_cpy:7.1f}us "
					f"ratio={ratio:.2f}x"
				)


	# Run the benchmark only when this file is executed as a script, so that
	# importing the module does not trigger the sweep.
	if __name__ == "__main__":
	main()