# Source: Kernels / cuda / hadamard
# Commit e4a8c54 (verified), by galqiwi:
#   Re-initial source (hadamard kebab name, BSD-3-Clause)
"""Throughput benchmark vs torch.clone (memcpy lower bound)."""
import math
import time
import torch
from hadamard import hadamard_transform
def _bench(fn, x, iters=200, warmup=20):
for _ in range(warmup):
fn(x)
torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(iters):
fn(x)
torch.cuda.synchronize()
return (time.perf_counter() - t0) / iters * 1e6 # microseconds
def main():
    """Print hadamard_transform vs. torch.clone timings over a dtype/width sweep.

    For every dtype in (float16, bfloat16, float32) and every last-dimension
    size in (512, 4096, 16384, 32768), a random ``(32 * 1024, dim)`` tensor is
    created on the CUDA device and timed with ``_bench`` twice: once through
    ``hadamard_transform`` using ``scale = 1 / sqrt(dim)``, and once through
    ``torch.clone``. One line per combination reports both timings and the
    hadamard/clone ratio.
    """
    dtypes = (torch.float16, torch.bfloat16, torch.float32)
    widths = (512, 4096, 16384, 32768)
    n_rows = 32 * 1024

    for dtype in dtypes:
        # "torch.float16" -> "float16"; constant across the inner loop.
        dtype_name = str(dtype).split(".")[-1]
        for dim in widths:
            x = torch.randn(n_rows, dim, dtype=dtype, device="cuda")
            scale = 1.0 / math.sqrt(dim)

            def scaled_hadamard(t, scale=scale):
                # Default argument pins this iteration's scale to the closure.
                return hadamard_transform(t, scale=scale)

            # Same measurement order as always: transform first, copy second.
            t_had = _bench(scaled_hadamard, x)
            t_cpy = _bench(torch.clone, x)
            ratio = t_had / t_cpy
            print(
                f"dtype={dtype_name:>9} dim={dim:>5} "
                f"hadamard={t_had:7.1f}us clone={t_cpy:7.1f}us "
                f"ratio={ratio:.2f}x"
            )
# Script entry point: run the benchmark sweep only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()