"""Microbenchmark: CPU vs GPU HTMLayer forward at HYDRA training sizes.

Usage:
    source .venv/bin/activate
    export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
    python htm_rust/bench_gpu.py
"""
import os
import sys
import time

# Make the repository root (the directory above this file's parent) importable
# so `subsystems` resolves no matter where the script is launched from.
_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)

import numpy as np
import torch

from subsystems.htm import HTMLayer
def bench(layer: HTMLayer, sdr: torch.Tensor, warmup: int = 1, iters: int = 3) -> float:
    """Return the mean wall-clock time of one `layer(sdr)` forward, in ms.

    Args:
        layer: model to time; only needs to be callable as `layer(sdr)`.
        sdr: input tensor; its device determines whether CUDA syncs happen.
        warmup: untimed forward passes run first (warms caches/allocator).
        iters: timed forward passes averaged over.
    """
    # Key synchronization on the input's device, not torch.cuda.is_available():
    # CUDA launches are async, so GPU runs must sync before/after timing, but
    # a pure-CPU run should never touch (and lazily initialize) the CUDA
    # context just because a GPU happens to exist on the machine.
    use_cuda = sdr.is_cuda
    for _ in range(warmup):
        _ = layer(sdr)
    if use_cuda:
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(iters):
        _ = layer(sdr)
    if use_cuda:
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    return dt * 1000 / iters
def main() -> None:
    """Benchmark HTMLayer forward on CPU and, when CUDA is present, on GPU."""
    # HYDRA training config: B=8, T=2048, bits=16384, cols=2048.
    B, T, D = int(os.environ.get("B", 8)), int(os.environ.get("T", 2048)), 16384
    n_cols = 2048
    print(f"config: B={B} T={T} D={D} n_cols={n_cols}")
    print(f"torch: {torch.__version__} cuda={torch.cuda.is_available()}")

    # Build a fixed sparse SDR once (2% active bits per step, seeded so every
    # run benchmarks the identical input).
    rng = np.random.default_rng(0)
    sdr = np.zeros((B, T, D), dtype=bool)
    on = int(D * 0.02)
    for b in range(B):
        for t in range(T):
            idx = rng.choice(D, size=on, replace=False)
            sdr[b, t, idx] = True
    sdr_t = torch.from_numpy(sdr)

    # CPU baseline.
    print("\n--- CPU ---")
    cpu_layer = HTMLayer(
        input_bits=D, n_columns=n_cols, cells_per_column=32,
        batch_size=B, seed=42, use_gpu=False,
    )
    cpu_layer.train()
    cpu_ms = bench(cpu_layer, sdr_t, warmup=1, iters=2)
    print(f"CPU: {cpu_ms:.1f} ms/forward ({cpu_ms/T:.2f} ms/step × T={T})")

    # GPU: skip gracefully instead of crashing on `.cuda()` when no CUDA
    # device is available (e.g. running the script on a CPU-only box).
    if not torch.cuda.is_available():
        print("\n--- GPU --- skipped (CUDA not available)")
        return
    print("\n--- GPU ---")
    gpu_layer = HTMLayer(
        input_bits=D, n_columns=n_cols, cells_per_column=32,
        batch_size=B, seed=42, use_gpu=True,
    )
    gpu_layer.train()
    sdr_cuda = sdr_t.cuda()
    gpu_ms = bench(gpu_layer, sdr_cuda, warmup=1, iters=2)
    print(f"GPU: {gpu_ms:.1f} ms/forward ({gpu_ms/T:.2f} ms/step × T={T})")
    print(f"\nSpeedup: {cpu_ms / gpu_ms:.2f}x")


if __name__ == "__main__":
    main()