Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

feather-runtime / overlay /tests /test_engram.py

Jackoatmon

Update Feather h200 training runtime image

e317e25 verified 26 days ago

raw

history blame contribute delete

6.73 kB

	"""Tests for GPUEngram Sparse Modern Hopfield retrieval path.

	Tests are written first (TDD) against the new matmul-based retrieval.
	Run with: pytest tests/test_engram.py -v
	"""
	from __future__ import annotations

	import math

	import pytest
	import torch
	import torch.nn as nn


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def _make_engram(d_model: int = 64, n_columns: int = 1024, hebbian_boost: bool = False):
	    """Construct a ``GPUEngram`` and hand it back already switched to eval mode.

	    ``hydra.engram`` is imported when this helper is called rather than when
	    the test module itself is imported.

	    Args:
	        d_model: Forwarded to ``GPUEngram(d_model=...)``.
	        n_columns: Forwarded to ``GPUEngram(n_columns=...)``.
	        hebbian_boost: Forwarded to ``GPUEngram(hebbian_boost=...)``.

	    Returns:
	        The freshly built module after ``eval()`` has been called on it.
	    """
	    from hydra.engram import GPUEngram

	    engram = GPUEngram(
	        d_model=d_model,
	        n_columns=n_columns,
	        hebbian_boost=hebbian_boost,
	    )
	    engram.eval()
	    return engram


	# ---------------------------------------------------------------------------
	# test_forward_shape
	# ---------------------------------------------------------------------------

	def test_forward_shape():
	    """Forward keeps the (B, T, D) input shape and returns a 0-dim hit_rate."""
	    B, T, D = 2, 16, 64
	    expected_shape = (B, T, D)
	    engram = _make_engram(d_model=D, n_columns=1024)

	    # Inputs are drawn in the same order as before: activations, then ids.
	    hidden = torch.randn(*expected_shape)
	    ids = torch.randint(0, 1000, (B, T))
	    out, hit_rate = engram(hidden, ids)

	    assert out.shape == expected_shape, f"Expected ({B},{T},{D}), got {out.shape}"
	    assert hit_rate.ndim == 0, f"hit_rate should be scalar, got shape {hit_rate.shape}"


	# ---------------------------------------------------------------------------
	# test_gradient_flow
	# ---------------------------------------------------------------------------

	def test_gradient_flow():
	    """Backward through the Hopfield matmul path must populate ``memory.grad``.

	    The earlier scatter-gather retrieval read ``self.memory[indices]``, so
	    gradient reached only the rows that were indexed. The matmul formulation
	    (``scores = x @ memory.T`` followed by ``weights @ memory``) runs a full
	    matmul over the memory matrix, so on a random batch the memory parameter
	    is expected to receive a non-zero gradient.
	    """
	    D, N = 64, 128
	    engram = _make_engram(d_model=D, n_columns=N)
	    engram.train()  # the helper returns the module in eval mode

	    inputs = torch.randn(2, 8, D, requires_grad=True)
	    ids = torch.randint(0, 100, (2, 8))
	    output, _ = engram(inputs, ids)
	    output.sum().backward()

	    memory_grad = engram.memory.grad
	    assert memory_grad is not None, "self.memory.grad must be non-None after backward"
	    assert memory_grad.abs().sum() > 0, "self.memory.grad must have non-zero entries"


	# ---------------------------------------------------------------------------
	# test_sparsity
	# ---------------------------------------------------------------------------

	def test_sparsity():
	    """At least 95% of the sparse attention weights must be exactly zero.

	    entmax-1.5 (alpha-entmax) produces truly sparse distributions. Sparsity
	    increases with score spread; after gradient descent the memory keys will
	    be unit-scale, so the memory is re-initialised to unit-norm rows to
	    represent that operating condition (the tiny 0.01-init default would
	    give near-uniform scores and therefore lower sparsity by design).

	    The weights are recomputed here from ``m.memory`` instead of being read
	    out of the module, so this checks the retrieval math rather than the
	    module's forward pass. All random draws come from a private seeded
	    generator, which keeps the threshold check reproducible without
	    touching the global RNG state seen by other tests.
	    """
	    D, N = 64, 1024

	    from hydra.engram import GPUEngram

	    gen = torch.Generator().manual_seed(0)

	    m = GPUEngram(d_model=D, n_columns=N)
	    # Re-initialise memory to unit-norm scale, representative of trained weights.
	    with torch.no_grad():
	        m.memory.data = torch.nn.functional.normalize(
	            torch.randn(N, D, generator=gen), dim=-1
	        )
	    m.eval()

	    x = torch.randn(4, 32, D, generator=gen)

	    # Replicate the retrieve path to inspect weights directly.
	    with torch.no_grad():
	        scores = x @ m.memory.T  # (4, 32, N)
	        # Only the import is guarded: an ImportError raised from inside
	        # entmax15 itself should surface, not silently select the fallback.
	        try:
	            from entmax import entmax15
	        except ImportError:
	            # top-k softmax fallback: k=32, guaranteed >= 96.9% zeros at N=1024
	            k = 32
	            topk_vals, topk_idx = scores.topk(k, dim=-1)
	            topk_w = torch.softmax(topk_vals, dim=-1)
	            weights = torch.zeros_like(scores)
	            weights.scatter_(-1, topk_idx, topk_w)
	        else:
	            weights = entmax15(scores, dim=-1)

	    zero_fraction = (weights == 0).float().mean().item()
	    assert zero_fraction >= 0.95, (
	        f"Expected >= 95% sparsity in attention weights, got {zero_fraction:.3f}"
	    )


	# ---------------------------------------------------------------------------
	# test_no_nan_on_zero_input
	# ---------------------------------------------------------------------------

	def test_no_nan_on_zero_input():
	    """An all-zero input must still give finite results (no NaN/Inf from entmax).

	    Both the output tensor and the returned hit_rate are checked.
	    """
	    D, N = 64, 256
	    engram = _make_engram(d_model=D, n_columns=N)
	    engram.eval()

	    zero_x = torch.zeros(1, 8, D)
	    zero_ids = torch.zeros(1, 8, dtype=torch.long)
	    out, hit_rate = engram(zero_x, zero_ids)

	    output_is_finite = torch.isfinite(out).all()
	    assert output_is_finite, "Output contains NaN or Inf on zero input"
	    assert torch.isfinite(hit_rate), "hit_rate is NaN or Inf on zero input"


	# ---------------------------------------------------------------------------
	# test_scales_to_32k
	# ---------------------------------------------------------------------------

	def test_scales_to_32k():
	    """A 32768-column memory must run on CPU without OOM and keep the output shape.

	    The output is additionally required to be finite everywhere.
	    """
	    from hydra.engram import GPUEngram

	    D, N = 128, 32768
	    engram = GPUEngram(d_model=D, n_columns=N)
	    engram.eval()

	    hidden = torch.randn(1, 64, D)
	    ids = torch.randint(0, 1000, (1, 64))
	    # The hit rate is not part of this check, so it is discarded.
	    out, _ = engram(hidden, ids)

	    assert out.shape == (1, 64, D), f"Expected (1, 64, {D}), got {out.shape}"
	    assert torch.isfinite(out).all(), "Output contains NaN/Inf at n_columns=32768"


	# ---------------------------------------------------------------------------
	# Bonus: hebbian_boost=False (default) does NOT update memory.data during train
	# ---------------------------------------------------------------------------

	def test_hebbian_off_by_default():
	    """With the constructor's default settings, a train-mode forward leaves memory.data unchanged.

	    The module is built without passing ``hebbian_boost`` at all, so the
	    check covers whatever default ``GPUEngram`` declares. Passing
	    ``hebbian_boost=False`` explicitly would pass even if the default were
	    ever flipped, which is not what the test name promises.
	    """
	    from hydra.engram import GPUEngram

	    D, N = 32, 64
	    # Deliberately omit hebbian_boost: the constructor default is under test.
	    m = GPUEngram(d_model=D, n_columns=N)
	    m.train()

	    # Snapshot before the forward pass; clone() so the copy cannot alias
	    # storage that an in-place update would modify.
	    before = m.memory.data.clone()
	    x = torch.randn(2, 4, D)
	    token_ids = torch.randint(0, 50, (2, 4))
	    m(x, token_ids)
	    after = m.memory.data

	    assert torch.equal(before, after), (
	        "memory.data was mutated during forward but hebbian_boost=False"
	    )


	def test_hebbian_on_updates_memory():
	    """With hebbian_boost=True, a train-mode forward must change memory.data."""
	    from hydra.engram import GPUEngram

	    D, N = 32, 64
	    engram = GPUEngram(d_model=D, n_columns=N, hebbian_boost=True)
	    engram.train()

	    # clone() gives an independent snapshot to compare against afterwards.
	    snapshot = engram.memory.data.clone()
	    hidden = torch.randn(2, 4, D)
	    ids = torch.randint(0, 50, (2, 4))
	    engram(hidden, ids)

	    unchanged = torch.equal(snapshot, engram.memory.data)
	    assert not unchanged, (
	        "memory.data was NOT mutated during forward but hebbian_boost=True"
	    )