Instructions to use galqiwi/hadamard_transform_kernels with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Kernels
How to use galqiwi/hadamard_transform_kernels with Kernels:
```python
# !pip install kernels
from kernels import get_kernel

kernel = get_kernel("galqiwi/hadamard_transform_kernels")
```
- Notebooks
- Google Colab
- Kaggle
| """Throughput benchmark vs torch.clone (memcpy lower bound).""" | |
| import math | |
| import time | |
| import torch | |
| from hadamard import hadamard_transform | |
def _bench(fn, x, iters=200, warmup=20):
    """Return the mean wall-clock time of one ``fn(x)`` call, in microseconds.

    ``fn(x)`` is first called ``warmup`` times untimed, then ``iters`` times
    inside a single timed region; the elapsed time is divided by ``iters``.

    When CUDA is available, the device is synchronized right before the timer
    starts and again right before it is read. On hosts without CUDA the
    synchronization is skipped, so the helper can also time CPU callables
    instead of raising.

    Args:
        fn: Callable invoked as ``fn(x)``; its return value is discarded.
        x: Argument passed unchanged to every call of ``fn``.
        iters: Number of timed calls. Must be non-zero (the total time is
            divided by it).
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Average time per call in microseconds, as a float.
    """
    # torch.cuda.synchronize() raises when no CUDA device/driver is present,
    # so fall back to a no-op there. With CUDA present this is unchanged.
    if torch.cuda.is_available():
        sync = torch.cuda.synchronize
    else:
        def sync():
            return None

    for _ in range(warmup):
        fn(x)
    sync()  # make sure warmup work is finished before the clock starts

    t0 = time.perf_counter()
    for _ in range(iters):
        fn(x)
    sync()  # wait for all timed work to finish before reading the clock
    return (time.perf_counter() - t0) / iters * 1e6  # microseconds
def main():
    """Print hadamard_transform vs. torch.clone timings for each dtype/dim pair.

    For every dtype and row width in the sweep, a random CUDA tensor with
    32 * 1024 rows is created, both operations are timed with ``_bench`` on
    that same tensor, and one line with the two timings and their ratio is
    printed.
    """
    device = "cuda"
    num_rows = 32 * 1024
    for dtype in (torch.float16, torch.bfloat16, torch.float32):
        for dim in (512, 4096, 16384, 32768):
            x = torch.randn(num_rows, dim, dtype=dtype, device=device)
            scale = 1.0 / math.sqrt(dim)

            def scaled_hadamard(t, scale=scale):
                # Bind the current scale so the callable takes only the tensor.
                return hadamard_transform(t, scale=scale)

            # Time the transform first, then the plain copy, on the same input.
            t_had = _bench(scaled_hadamard, x)
            t_cpy = _bench(torch.clone, x)

            report = (
                f"dtype={str(dtype).split('.')[-1]:>9} dim={dim:>5} "
                f"hadamard={t_had:7.1f}us clone={t_cpy:7.1f}us "
                f"ratio={t_had / t_cpy:.2f}x"
            )
            print(report)
# Run the benchmark sweep only when this file is executed as a script.
if __name__ == "__main__":
    main()