Instructions to use galqiwi/hadamard_transform_kernels with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Kernels
How to use galqiwi/hadamard_transform_kernels with Kernels:
# !pip install kernels
from kernels import get_kernel
kernel = get_kernel("galqiwi/hadamard_transform_kernels")
- Notebooks
- Google Colab
- Kaggle
File size: 1,058 Bytes
e4a8c54
"""Throughput benchmark vs torch.clone (memcpy lower bound)."""
import math
import time
import torch
from hadamard import hadamard_transform
def _bench(fn, x, iters=200, warmup=20):
for _ in range(warmup):
fn(x)
torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(iters):
fn(x)
torch.cuda.synchronize()
return (time.perf_counter() - t0) / iters * 1e6 # microseconds
def main():
    """Time ``hadamard_transform`` against ``torch.clone`` and print a table.

    For every combination of dtype (float16, bfloat16, float32) and dim
    (512, 4096, 16384, 32768), a random ``(32 * 1024, dim)`` tensor is created
    on the "cuda" device. Both operations are timed with ``_bench`` and one
    line is printed with the two timings and their ratio.
    """
    device = "cuda"
    rows = 32 * 1024
    dtypes = (torch.float16, torch.bfloat16, torch.float32)
    dims = (512, 4096, 16384, 32768)

    for dtype in dtypes:
        for dim in dims:
            x = torch.randn(rows, dim, dtype=dtype, device=device)
            scale = 1.0 / math.sqrt(dim)

            # Bind the current scale as a default so the callable is
            # self-contained for this (dtype, dim) combination.
            def scaled_hadamard(t, scale=scale):
                return hadamard_transform(t, scale=scale)

            t_had = _bench(scaled_hadamard, x)
            t_cpy = _bench(torch.clone, x)

            report = (
                f"dtype={str(dtype).split('.')[-1]:>9} dim={dim:>5} "
                f"hadamard={t_had:7.1f}us clone={t_cpy:7.1f}us "
                f"ratio={t_had / t_cpy:.2f}x"
            )
            print(report)
# Run the benchmark sweep only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|