Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

feather-runtime / overlay /subsystems /htm.py

Jackoatmon

Update Feather H200 runtime: Nemotron streaming and HTM force-CPU canary fixes

c2bf4b6 verified 15 days ago

raw

history blame contribute delete

19.7 kB

	"""
	HTM torch wrapper around the pyo3 ``htm_rust`` crate.

	Exposes ``HTMLayer``, a ``torch.nn.Module`` that batches calls to
	``htm_rust.HTMRegion.step`` across a ``(B, T, input_bits)`` boolean SDR stream
	and returns ``(B, T, n_columns + 1)`` where the last channel is the anomaly
	score. HTM learning is Hebbian (not gradient), so the wrapper runs under
	``torch.no_grad()``. Downstream layers carry gradients back to the embedding
	via their own learnable projection from the binary column output.

	Per-sequence state semantics
	---------------------------
	Training-time forward passes are independent windows of tokens (re-sampled
	every step), so carrying TM state across calls would mix unrelated contexts.
	This layer calls ``reset()`` on every region at the top of ``forward``; the
	TM learns within-window temporal patterns only. Users that want cross-window
	continuity (e.g. eval over a long document) should instead construct the
	layer and drive ``step_stream`` themselves (not implemented here; the
	single-forward contract is sufficient for the autoresearch loop).

	Device handling
	---------------
	``htm_rust`` runs on CPU. If ``sdr`` lives on CUDA we pay a
	``sdr.cpu().numpy()`` round-trip per forward. The return tensor is cast back
	to ``sdr.device``. For expected use (batch<=32, T<=2048, bits=16384) this
	copy is small compared to the SP/TM compute.
	"""

	from __future__ import annotations

	import time
	from concurrent.futures import ThreadPoolExecutor

	import numpy as np
	import torch
	import torch.nn as nn

	import htm_rust

	# step_many releases the GIL for the whole pass, so multiple threads can
	# truly run regions in parallel — wall-clock scales with B up to CPU cores.
	_HTM_HAS_STEP_MANY = hasattr(htm_rust.HTMRegion, "step_many")
	# GPU backend: built with `maturin develop --features gpu`. One CUDA region
	# per batch slot, persistent device state for SP synapses. Transparent
	# fallback to CPU when not available.
	_HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
	# Zero-copy CUDA path: consumes torch CUDA tensors directly via the
	# __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
	# and the D2H of outputs. Huge win when the input SDR already lives on GPU
	# (which is the train.py hot path — retina is a device buffer).
	_HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_cuda")
	# Fused megakernel path: collapses all T timesteps + SP + TM into a single
	# CUDA launch per forward. Replaces global top-K with per-column threshold
	# inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
	# Opt-in via env var (default on when available).
	import os as _os_fused
	_HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_fused_cuda")
	_HTM_USE_FUSED = _HTM_HAS_FUSED and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
	_HTM_USE_BATCHED_FUSED = _HTM_USE_FUSED and bool(int(_os_fused.environ.get("HYDRA_HTM_BATCHED_FUSED", "1")))


	class HTMLayer(nn.Module):
	"""Batched torch wrapper around ``htm_rust.HTMRegion``.

	One independent region per batch slot so temporal memory learns
	sequence-local patterns without cross-batch bleed. Regions grow
	lazily if a larger batch shows up.

	Output is ``(B, T, n_columns + 1)``: first ``n_columns`` channels are
	the binary active-column mask (float32 0/1) and the last channel is
	the per-timestep anomaly score in [0, 1].
	"""

	def __init__(
	self,
	input_bits: int = 16384,
	n_columns: int = 2048,
	cells_per_column: int = 32,
	batch_size: int = 1,
	seed: int = 42,
	learn: bool = True,
	reset_each_forward: bool = True,
	use_gpu: bool \| None = None,
	) -> None:
	super().__init__()
	self.input_bits = input_bits
	self.n_columns = n_columns
	self.cells_per_column = cells_per_column
	self.learn = learn
	self.reset_each_forward = reset_each_forward
	self._seed_base = seed
	# Learn gating: HTM learn kernels (tm_punish, tm_learn_reinforce, tm_grow)
	# are 56% of total HTM CUDA time. Gating them to run every N forwards
	# instead of every forward cuts HTM cost ~2x. Hebbian learning still
	# converges since the EMA accumulates over many calls. Env:
	# HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
	import os as _os
	self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
	self._forward_counter = 0
	# GPU backend gate. Default: auto-detect — use GPU when the pyo3
	# module was built with --features gpu AND CUDA is actually usable.
	# HYDRA_FORCE_HTM_CPU=1 is an operational safety valve for paid remote
	# canaries when the compiled CUDA HTM backend is present but unstable on
	# a specific hardware/runtime combination.
	force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
	if use_gpu is None:
	use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
	elif use_gpu and not _HTM_HAS_GPU:
	raise RuntimeError(
	"HTMLayer(use_gpu=True) but htm_rust was not built with "
	"--features gpu. Re-run `maturin develop --features gpu`."
	)
	elif use_gpu and force_cpu:
	use_gpu = False
	self._use_gpu = bool(use_gpu)
	cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
	self._region_cls = cls
	self._regions = [
	cls(input_bits, n_columns, cells_per_column, seed + i)
	for i in range(batch_size)
	]
	self.register_buffer("_dummy", torch.zeros(1), persistent=False)
	import os as _os
	self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))

	def _ensure_regions(self, B: int) -> None:
	while len(self._regions) < B:
	idx = len(self._regions)
	self._regions.append(
	self._region_cls(
	self.input_bits,
	self.n_columns,
	self.cells_per_column,
	self._seed_base + idx,
	)
	)

	def reset(self) -> None:
	"""Clear TM predictive state on every region (keeps SP synapses)."""
	for r in self._regions:
	r.reset()

	@torch.no_grad()
	def forward(self, sdr: torch.Tensor) -> torch.Tensor:
	B, T, D = sdr.shape
	if D != self.input_bits:
	raise ValueError(f"expected input_bits={self.input_bits}, got {D}")
	self._ensure_regions(B)
	if self.reset_each_forward:
	self.reset()

	# Learn-gate: run learn kernels only every N forwards (skips 56% of
	# HTM CUDA time on skip-forwards; Hebbian EMA still converges).
	self._forward_counter += 1
	learn = bool(
	self.learn
	and self.training
	and (self._forward_counter % self._learn_every == 0)
	)

	# Zero-copy CUDA hot path. SDR already lives on GPU (retina buffer),
	# so we skip sdr.cpu()/numpy round-trip AND the output D2H. The Rust
	# kernel writes directly into torch-owned CUDA tensors via CAI.
	# Gives 5-10x tok/s on train.py vs the numpy path below.
	if _HTM_HAS_CAI and self._use_gpu and sdr.is_cuda:
	sdr_u8 = sdr.contiguous().to(torch.uint8) if sdr.dtype != torch.uint8 else sdr.contiguous()
	cols_out = torch.empty((B, T, self.n_columns), dtype=torch.uint8, device=sdr.device)
	anom_out = torch.empty((B, T), dtype=torch.float32, device=sdr.device)
	# Pick fused (1 launch) or legacy (12*T launches) path.
	if _HTM_USE_FUSED:
	for b in range(B):
	self._regions[b].step_many_fused_cuda(
	sdr_u8[b].__cuda_array_interface__,
	cols_out[b].__cuda_array_interface__,
	anom_out[b].__cuda_array_interface__,
	learn,
	)
	else:
	for b in range(B):
	self._regions[b].step_many_cuda(
	sdr_u8[b].__cuda_array_interface__,
	cols_out[b].__cuda_array_interface__,
	anom_out[b].__cuda_array_interface__,
	learn,
	)
	# Assemble (B, T, n_cols+1) — keep bf16-friendly float32.
	return torch.cat((cols_out.to(torch.float32), anom_out.unsqueeze(-1)), dim=-1)

	# Fallback: CPU / numpy path. Kept for CPU-input case and for
	# builds without CAI support.
	sdr_np = sdr.detach().cpu().contiguous().to(torch.bool).numpy()
	out = np.zeros((B, T, self.n_columns + 1), dtype=np.float32)

	def _process_one(b: int) -> None:
	region = self._regions[b]
	if self._use_gpu:
	cols, anom = region.step_many_gpu(sdr_np[b], learn)
	out[b, :, : self.n_columns] = cols
	out[b, :, self.n_columns] = anom
	elif _HTM_HAS_STEP_MANY:
	# Single Rust call: T steps with GIL released for the whole pass.
	cols, anom = region.step_many(sdr_np[b], learn) # cols (T, n_cols), anom (T,)
	out[b, :, : self.n_columns] = cols
	out[b, :, self.n_columns] = anom
	else:
	for t in range(T):
	active_cols, _ac, _pc, anomaly = region.step(sdr_np[b, t], learn)
	out[b, t, : self.n_columns] = active_cols
	out[b, t, self.n_columns] = float(anomaly)

	if B == 1:
	_process_one(0)
	elif self._use_gpu:
	# GPU regions share the CUDA context; serialise to avoid contention
	# for stream 0. Per-region latency is dominated by kernel compute,
	# not threadable on a single stream cheaply — future work: one
	# CUDA stream per region.
	for b in range(B):
	_process_one(b)
	else:
	# Each thread runs in pure Rust under py.allow_threads, so they
	# parallelise to wall-clock min(B, CPU_cores).
	list(self._htm_pool.map(_process_one, range(B)))

	return torch.from_numpy(out).to(sdr.device)

	def forward_async(self, sdr: torch.Tensor):
	"""Submit HTM work and return a handle awaitable via ``forward_await``.

	On the CAI zero-copy path (GPU tensor in, GPU region), the Rust
	CUDA kernels are launched on cudarc's internal stream and control
	returns immediately — no device synchronization. The caller's
	next GPU ops (embedding lookup, Mamba forward, etc.) are enqueued
	on PyTorch's default stream and can execute while HTM kernels run
	on the cudarc stream. ``forward_await`` performs the cross-stream
	sync (via ``device_sync``) and assembles the output tensor only
	when the result is actually consumed.

	For cooperative kernels (``step_many_fused_cuda``) the GPU can only
	run one cooperative launch at a time, so kernel-level overlap with
	default-stream work is limited. The win is **CPU-side launch
	overlap**: instead of the CPU blocking ~10 ms waiting for HTM
	before it can even enqueue wte/mamba, it enqueues everything up
	front and the GPU executes back-to-back without CPU stalls.

	On the legacy CPU/numpy path, work is dispatched to a thread pool
	as before."""
	B, T, D = sdr.shape
	if D != self.input_bits:
	raise ValueError(f"expected input_bits={self.input_bits}, got {D}")
	self._ensure_regions(B)
	if self.reset_each_forward:
	self.reset()
	learn = bool(self.learn and self.training)

	if _HTM_HAS_CAI and self._use_gpu and sdr.is_cuda:
	sdr_u8 = sdr.contiguous().to(torch.uint8) if sdr.dtype != torch.uint8 else sdr.contiguous()
	cols_out = torch.empty((B, T, self.n_columns), dtype=torch.uint8, device=sdr.device)
	anom_out = torch.empty((B, T), dtype=torch.float32, device=sdr.device)
	# ONE cooperative kernel launch for all B regions. Breaks past
	# the CUDA cooperative-kernel device-level serialization (only
	# one cooperative kernel runs at a time). A single launch with
	# grid.y = B processes all regions concurrently — ~B× speedup.
	# Falls back to sequential dispatch if the batched entry isn't
	# available (older htm_rust wheel).
	if _HTM_USE_BATCHED_FUSED and hasattr(htm_rust, "step_batch_fused_cuda"):
	# Slice self._regions to match B: _ensure_regions may have
	# allocated more regions than the current batch size needs
	# (e.g. factual eval uses smaller batches than training).
	try:
	htm_rust.step_batch_fused_cuda(
	self._regions[:B],
	[sdr_u8[b].__cuda_array_interface__ for b in range(B)],
	[cols_out[b].__cuda_array_interface__ for b in range(B)],
	[anom_out[b].__cuda_array_interface__ for b in range(B)],
	learn,
	)
	except RuntimeError as _e:
	if "COOPERATIVE_LAUNCH_TOO_LARGE" in str(_e):
	# Batch too large for cooperative grid. Fall back to
	# sequential per-region fused launches (each B=1).
	for b in range(B):
	self._regions[b].step_many_fused_cuda(
	sdr_u8[b].__cuda_array_interface__,
	cols_out[b].__cuda_array_interface__,
	anom_out[b].__cuda_array_interface__,
	learn,
	)
	else:
	raise
	elif _HTM_USE_FUSED:
	for b in range(B):
	self._regions[b].step_many_fused_cuda(
	sdr_u8[b].__cuda_array_interface__,
	cols_out[b].__cuda_array_interface__,
	anom_out[b].__cuda_array_interface__,
	learn,
	)
	else:
	for b in range(B):
	self._regions[b].step_many_cuda(
	sdr_u8[b].__cuda_array_interface__,
	cols_out[b].__cuda_array_interface__,
	anom_out[b].__cuda_array_interface__,
	learn,
	)
	# NO sync here — kernels are in-flight on cudarc's stream.
	# forward_await() will sync before the output is consumed.
	return {
	'cuda_deferred': True,
	'cols_out': cols_out,
	'anom_out': anom_out,
	'region0': self._regions[0],
	}

	sdr_np = sdr.detach().cpu().contiguous().to(torch.bool).numpy()
	out = np.zeros((B, T, self.n_columns + 1), dtype=np.float32)

	def _process_one(b):
	region = self._regions[b]
	if self._use_gpu:
	cols, anom = region.step_many_gpu(sdr_np[b], learn)
	out[b, :, : self.n_columns] = cols
	out[b, :, self.n_columns] = anom
	elif _HTM_HAS_STEP_MANY:
	cols, anom = region.step_many(sdr_np[b], learn)
	out[b, :, : self.n_columns] = cols
	out[b, :, self.n_columns] = anom
	else:
	for t in range(T):
	active_cols, _ac, _pc, anomaly = region.step(sdr_np[b, t], learn)
	out[b, t, : self.n_columns] = active_cols
	out[b, t, self.n_columns] = float(anomaly)

	fut = self._htm_pool.submit(lambda: [_process_one(b) for b in range(B)])
	return {'fut': fut, 'out': out, 'device': sdr.device}

	def forward_await(self, handle) -> torch.Tensor:
	if handle.get('cuda_deferred'):
	# Cross-stream sync: block until cudarc stream finishes HTM
	# kernels so the output tensors are safe to read on the
	# default stream.
	region0 = handle['region0']
	if hasattr(region0, "device_sync"):
	region0.device_sync()
	else:
	torch.cuda.synchronize()
	cols_out = handle['cols_out']
	anom_out = handle['anom_out']
	return torch.cat(
	(cols_out.to(torch.float32), anom_out.unsqueeze(-1)), dim=-1
	)
	if 'cuda_result' in handle:
	return handle['cuda_result']
	handle['fut'].result()
	return torch.from_numpy(handle['out']).to(handle['device'])


	if __name__ == "__main__":
	torch.manual_seed(0)

	# Smoke test: (B=2, T=4, D=16384) random 2%-sparse SDR
	B, T, D = 2, 4, 16384
	n_columns = 2048
	target_active_in = int(D * 0.02) # 327

	layer = HTMLayer(
	input_bits=D,
	n_columns=n_columns,
	cells_per_column=32,
	batch_size=B,
	seed=42,
	learn=True,
	)
	layer.train()

	rng = np.random.default_rng(0)
	sdr = np.zeros((B, T, D), dtype=bool)
	for b in range(B):
	for t in range(T):
	idx = rng.choice(D, size=target_active_in, replace=False)
	sdr[b, t, idx] = True
	sdr_t = torch.from_numpy(sdr)

	t0 = time.perf_counter()
	out = layer(sdr_t)
	dt_first = time.perf_counter() - t0

	assert out.shape == (B, T, n_columns + 1), f"shape {out.shape}"
	assert out.dtype == torch.float32, f"dtype {out.dtype}"

	active_cols = out[..., :n_columns]
	anomaly = out[..., n_columns]

	col_sums = active_cols.sum(dim=-1) # (B, T)
	mean_active = col_sums.float().mean().item()
	expected = n_columns * 0.02 # ≈ 40.96
	assert 20 <= mean_active <= 60, (
	f"active columns per step out of 2% band: {mean_active:.1f} (expected ~{expected:.1f})"
	)

	# t=0 has no TM prediction → anomaly = 1.0 on every batch slot.
	assert torch.allclose(anomaly[:, 0], torch.ones(B)), f"t=0 anomaly {anomaly[:, 0]}"

	# Second forward on same (reset) layer: identical shapes, deterministic re-run possible.
	t0 = time.perf_counter()
	out2 = layer(sdr_t)
	dt_second = time.perf_counter() - t0
	assert out2.shape == out.shape

	# Repeating-sequence anomaly decay check — one region, T=8 repeats of same pattern.
	rep_layer = HTMLayer(
	input_bits=D,
	n_columns=n_columns,
	batch_size=1,
	seed=7,
	learn=True,
	)
	rep_layer.train()
	base = torch.zeros(D, dtype=torch.bool)
	idx = rng.choice(D, size=target_active_in, replace=False)
	base[idx] = True
	rep = base.unsqueeze(0).unsqueeze(0).expand(1, 16, D).clone()
	rep_out = rep_layer(rep)
	rep_anom = rep_out[0, :, n_columns]
	assert rep_anom[0].item() > 0.5, f"anomaly at t=0 should be high, got {rep_anom[0]:.3f}"
	assert rep_anom[-1].item() < rep_anom[0].item(), (
	f"anomaly should decay on repeats: first={rep_anom[0]:.3f} last={rep_anom[-1]:.3f}"
	)

	print("[OK] shape:", tuple(out.shape))
	print(f"[OK] mean active cols/step: {mean_active:.2f} (target ~{expected:.1f})")
	print(f"[OK] t=0 anomaly = 1.0 on all batch slots")
	print(f"[OK] repeating-sequence anomaly: first={rep_anom[0]:.3f} -> last={rep_anom[-1]:.3f}")
	print(f"[OK] forward wall-clock: first={dt_first1000:.1f}ms second={dt_second1000:.1f}ms "
	f"on (B={B}, T={T}, D={D})")