"""Microbenchmark: CPU vs GPU HTMLayer forward at HYDRA training sizes.

Usage:
    source .venv/bin/activate
    export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
    python htm_rust/bench_gpu.py

The batch size and sequence length default to 8 and 2048; override them with
the ``B`` and ``T`` environment variables.
"""

import os
import sys
import time

# Put the parent of this file's directory on sys.path so that the
# `subsystems` package imports when the script is run directly.
_FEATHER = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _FEATHER not in sys.path:
    sys.path.insert(0, _FEATHER)

import numpy as np
import torch

from subsystems.htm import HTMLayer


def bench(layer: HTMLayer, sdr: torch.Tensor, warmup: int = 1, iters: int = 3) -> float:
    """Return the mean wall-clock milliseconds per ``layer(sdr)`` call.

    Args:
        layer: Layer to time; it is simply called with ``sdr``.
        sdr: Input passed unchanged to every call.
        warmup: Number of untimed calls made before the clock starts.
        iters: Number of timed calls; must be non-zero.

    Returns:
        Total elapsed time of the timed calls, in milliseconds, divided by
        ``iters``.

    Raises:
        ZeroDivisionError: If ``iters`` is 0.
    """
    cuda_present = torch.cuda.is_available()

    for _ in range(warmup):
        layer(sdr)
    # CUDA work is queued asynchronously, so drain it before starting the
    # clock; otherwise warmup work would leak into the timed region.
    if cuda_present:
        torch.cuda.synchronize()

    start = time.perf_counter()
    for _ in range(iters):
        layer(sdr)
    # Wait for queued GPU work to finish before stopping the clock.
    if cuda_present:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    return elapsed * 1000 / iters


def _build_sdr(batch: int, steps: int, bits: int, seed: int = 0) -> torch.Tensor:
    """Build a fixed boolean (batch, steps, bits) SDR with 2% of bits set per row."""
    rng = np.random.default_rng(seed)
    sdr = np.zeros((batch, steps, bits), dtype=bool)
    on = int(bits * 0.02)
    # The reshape is a view of the freshly allocated array, so writes land in
    # `sdr`; rows are visited in (b, t) row-major order, one draw per row.
    for row in sdr.reshape(batch * steps, bits):
        row[rng.choice(bits, size=on, replace=False)] = True
    return torch.from_numpy(sdr)


def _make_layer(input_bits: int, n_columns: int, batch_size: int, use_gpu: bool) -> HTMLayer:
    """Construct an HTMLayer with the benchmark's fixed settings, in train mode."""
    layer = HTMLayer(
        input_bits=input_bits,
        n_columns=n_columns,
        cells_per_column=32,
        batch_size=batch_size,
        seed=42,
        use_gpu=use_gpu,
    )
    layer.train()
    return layer


def main() -> None:
    """Time the CPU and GPU HTMLayer forward passes and print the speedup.

    The GPU half is skipped with a message when CUDA is not available, so the
    CPU figure is still reported instead of the script ending in a traceback.
    """
    # HYDRA training config: B=8, T=2048, bits=16384, cols=2048.
    B = int(os.environ.get("B", 8))
    T = int(os.environ.get("T", 2048))
    D = 16384
    n_cols = 2048
    print(f"config: B={B} T={T} D={D} n_cols={n_cols}")
    print(f"torch: {torch.__version__}  cuda={torch.cuda.is_available()}")

    # Build a fixed sparse SDR once and reuse it for both runs.
    sdr_t = _build_sdr(B, T, D)

    # CPU baseline.
    print("\n--- CPU ---")
    cpu_layer = _make_layer(D, n_cols, B, use_gpu=False)
    cpu_ms = bench(cpu_layer, sdr_t, warmup=1, iters=2)
    print(f"CPU: {cpu_ms:.1f} ms/forward  ({cpu_ms/T:.2f} ms/step × T={T})")

    # GPU.
    print("\n--- GPU ---")
    if not torch.cuda.is_available():
        # Without this guard the GPU setup below raises only after the slow
        # CPU benchmark has already completed.
        print("GPU: skipped (CUDA not available)")
        return
    gpu_layer = _make_layer(D, n_cols, B, use_gpu=True)
    sdr_cuda = sdr_t.cuda()
    gpu_ms = bench(gpu_layer, sdr_cuda, warmup=1, iters=2)
    print(f"GPU: {gpu_ms:.1f} ms/forward  ({gpu_ms/T:.2f} ms/step × T={T})")

    print(f"\nSpeedup: {cpu_ms / gpu_ms:.2f}x")


if __name__ == "__main__":
    main()