"""Throughput benchmark vs torch.clone (memcpy lower bound).""" import math import time import torch from hadamard import hadamard_transform def _bench(fn, x, iters=200, warmup=20): for _ in range(warmup): fn(x) torch.cuda.synchronize() t0 = time.perf_counter() for _ in range(iters): fn(x) torch.cuda.synchronize() return (time.perf_counter() - t0) / iters * 1e6 # microseconds def main(): device = "cuda" for dtype in (torch.float16, torch.bfloat16, torch.float32): for dim in (512, 4096, 16384, 32768): x = torch.randn(32 * 1024, dim, dtype=dtype, device=device) scale = 1.0 / math.sqrt(dim) t_had = _bench(lambda t: hadamard_transform(t, scale=scale), x) t_cpy = _bench(torch.clone, x) print( f"dtype={str(dtype).split('.')[-1]:>9} dim={dim:>5} " f"hadamard={t_had:7.1f}us clone={t_cpy:7.1f}us " f"ratio={t_had / t_cpy:.2f}x" ) if __name__ == "__main__": main()