# Jackoatmon's picture
# Update Feather a10g-large training runtime image
# fc102e3 verified
"""Microbenchmark: CPU vs GPU HTMLayer forward at HYDRA training sizes.
Usage:
source .venv/bin/activate
export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
python htm_rust/bench_gpu.py
"""
import os
import sys
import time
# Make the repository root (the parent of this script's directory) importable
# so the project-local `subsystems` package resolves when this file is run
# directly (e.g. `python htm_rust/bench_gpu.py`) rather than as a module.
_FEATHER = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _FEATHER not in sys.path:
    sys.path.insert(0, _FEATHER)
import numpy as np
import torch
# NOTE: this import relies on the sys.path insertion above.
from subsystems.htm import HTMLayer
def bench(layer: "HTMLayer", sdr: torch.Tensor, warmup: int = 1, iters: int = 3) -> float:
    """Return mean wall-clock milliseconds per `layer(sdr)` forward pass.

    Args:
        layer: Module to benchmark (CPU- or CUDA-resident).
        sdr: Input tensor; its device decides whether CUDA syncs are needed.
        warmup: Untimed forward passes run first to absorb one-time costs.
        iters: Timed forward passes averaged over.

    Returns:
        Mean ms/forward over `iters` timed passes.
    """
    for _ in range(warmup):
        _ = layer(sdr)
    # Synchronize only when the input actually lives on a CUDA device.
    # Gating on torch.cuda.is_available() (as before) forced syncs into the
    # CPU timing path on CUDA-capable machines, polluting the CPU numbers.
    need_sync = sdr.is_cuda
    if need_sync:
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(iters):
        _ = layer(sdr)
    if need_sync:
        # Forward launches are async on CUDA; wait for them before stopping
        # the clock so we time the work, not just the kernel launches.
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    return dt * 1000 / iters
def main() -> None:
    """Benchmark HTMLayer forward on CPU, then GPU if CUDA is available.

    Sizes match the HYDRA training config (B=8, T=2048, bits=16384,
    cols=2048); B and T are overridable via environment variables.
    """
    B, T, D = int(os.environ.get("B", 8)), int(os.environ.get("T", 2048)), 16384
    n_cols = 2048
    print(f"config: B={B} T={T} D={D} n_cols={n_cols}")
    print(f"torch: {torch.__version__} cuda={torch.cuda.is_available()}")
    # Build one fixed sparse SDR (2% active bits per step, seeded) so both
    # the CPU and GPU runs see identical input.
    rng = np.random.default_rng(0)
    sdr = np.zeros((B, T, D), dtype=bool)
    on = int(D * 0.02)
    for b in range(B):
        for t in range(T):
            idx = rng.choice(D, size=on, replace=False)
            sdr[b, t, idx] = True
    sdr_t = torch.from_numpy(sdr)
    # CPU baseline.
    print("\n--- CPU ---")
    cpu_layer = HTMLayer(
        input_bits=D, n_columns=n_cols, cells_per_column=32,
        batch_size=B, seed=42, use_gpu=False,
    )
    cpu_layer.train()
    cpu_ms = bench(cpu_layer, sdr_t, warmup=1, iters=2)
    print(f"CPU: {cpu_ms:.1f} ms/forward ({cpu_ms/T:.2f} ms/step × T={T})")
    # GPU — skip gracefully instead of crashing on .cuda() when CUDA is
    # absent (the old code raised a raw RuntimeError here).
    print("\n--- GPU ---")
    if not torch.cuda.is_available():
        print("CUDA not available; skipping GPU benchmark.")
        return
    gpu_layer = HTMLayer(
        input_bits=D, n_columns=n_cols, cells_per_column=32,
        batch_size=B, seed=42, use_gpu=True,
    )
    gpu_layer.train()
    sdr_cuda = sdr_t.cuda()
    gpu_ms = bench(gpu_layer, sdr_cuda, warmup=1, iters=2)
    print(f"GPU: {gpu_ms:.1f} ms/forward ({gpu_ms/T:.2f} ms/step × T={T})")
    print(f"\nSpeedup: {cpu_ms / gpu_ms:.2f}x")
if __name__ == "__main__":
    main()