"""Throughput benchmark vs torch.clone (memcpy lower bound)."""

import math
import time

import torch

from hadamard import hadamard_transform


def _bench(fn, x, iters=200, warmup=20):
    """Return the mean wall-clock time of one ``fn(x)`` call, in microseconds.

    ``fn(x)`` is called ``warmup`` times untimed, then ``iters`` times inside
    a single timed region. When CUDA is available the device is synchronized
    right before the timer starts and again before it is read, so
    asynchronously launched GPU work is fully included. Without CUDA the
    synchronization is skipped, which lets the helper time CPU callables
    instead of raising.

    Args:
        fn: Callable invoked as ``fn(x)``; its return value is discarded.
        x: Argument passed unchanged to every call.
        iters: Number of timed calls; must be at least 1.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Elapsed seconds divided by ``iters``, scaled to microseconds.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    # Fail before doing any work; otherwise iters=0 only surfaces as a
    # ZeroDivisionError after the whole warmup has already run.
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    # Queried once, outside the timed region, so the check itself is not
    # part of the measurement.
    sync_needed = torch.cuda.is_available()

    for _ in range(warmup):
        fn(x)
    if sync_needed:
        # Drain warmup work so it does not leak into the timed region.
        torch.cuda.synchronize()

    t0 = time.perf_counter()
    for _ in range(iters):
        fn(x)
    if sync_needed:
        # Wait for all queued work before reading the clock.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - t0

    return elapsed / iters * 1e6  # microseconds


def main():
    """Time hadamard_transform against torch.clone and print one row per case.

    Every combination of dtype (float16, bfloat16, float32) and last-dimension
    size (512, 4096, 16384, 32768) is run on a random ``(32 * 1024, dim)``
    CUDA tensor. The transform is called with ``scale = 1 / sqrt(dim)``. Each
    printed row shows both mean per-call times in microseconds and their
    ratio (hadamard time divided by clone time).
    """
    dtypes = (torch.float16, torch.bfloat16, torch.float32)
    dims = (512, 4096, 16384, 32768)
    # Flattened in the same order as the nested loops: dtype outer, dim inner.
    cases = [(dtype, dim) for dtype in dtypes for dim in dims]

    for dtype, dim in cases:
        x = torch.randn(32 * 1024, dim, dtype=dtype, device="cuda")
        scale = 1.0 / math.sqrt(dim)

        def scaled_hadamard(t):
            # Binds this iteration's scale; only used before the next rebinding.
            return hadamard_transform(t, scale=scale)

        t_had = _bench(scaled_hadamard, x)
        t_cpy = _bench(torch.clone, x)

        row = (
            f"dtype={str(dtype).split('.')[-1]:>9} dim={dim:>5} "
            f"hadamard={t_had:7.1f}us  clone={t_cpy:7.1f}us  "
            f"ratio={t_had / t_cpy:.2f}x"
        )
        print(row)


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()