| """Tests for GPUEngram Sparse Modern Hopfield retrieval path. |
| |
| Tests are written first (TDD) against the new matmul-based retrieval. |
| Run with: pytest tests/test_engram.py -v |
| """ |
| from __future__ import annotations |
|
|
| import math |
|
|
| import pytest |
| import torch |
| import torch.nn as nn |
|
|
|
|
| |
| |
| |
|
|
| def _make_engram(d_model: int = 64, n_columns: int = 1024, hebbian_boost: bool = False): |
| from hydra.engram import GPUEngram |
| m = GPUEngram(d_model=d_model, n_columns=n_columns, hebbian_boost=hebbian_boost) |
| m.eval() |
| return m |
|
|
|
|
| |
| |
| |
|
|
| def test_forward_shape(): |
| """Output tensor matches input shape; hit_rate is a scalar.""" |
| B, T, D = 2, 16, 64 |
| m = _make_engram(d_model=D, n_columns=1024) |
| x = torch.randn(B, T, D) |
| token_ids = torch.randint(0, 1000, (B, T)) |
| out, hit_rate = m(x, token_ids) |
| assert out.shape == (B, T, D), f"Expected ({B},{T},{D}), got {out.shape}" |
| assert hit_rate.ndim == 0, f"hit_rate should be scalar, got shape {hit_rate.shape}" |
|
|
|
|
| |
| |
| |
|
|
| def test_gradient_flow(): |
| """Backprop through the Hopfield matmul path must reach self.memory.grad. |
| |
| The old scatter-gather path used self.memory[indices] which DID produce |
| gradients only for indexed rows. The new path (scores = x @ memory.T then |
| weights @ memory) creates a full matmul, so every column gets a non-zero |
| gradient signal (on a random batch where all keys are attended to). |
| """ |
| D, N = 64, 128 |
| m = _make_engram(d_model=D, n_columns=N) |
| m.train() |
|
|
| x = torch.randn(2, 8, D, requires_grad=True) |
| token_ids = torch.randint(0, 100, (2, 8)) |
| out, _ = m(x, token_ids) |
| loss = out.sum() |
| loss.backward() |
|
|
| assert m.memory.grad is not None, "self.memory.grad must be non-None after backward" |
| assert m.memory.grad.abs().sum() > 0, "self.memory.grad must have non-zero entries" |
|
|
|
|
| |
| |
| |
|
|
| def test_sparsity(): |
| """At least 95% of alpha-entmax attention weights must be exactly zero. |
| |
| entmax-1.5 (alpha-entmax) produces truly sparse distributions. Sparsity |
| increases with score spread — after gradient descent the memory keys will |
| be unit-scale. We use unit-norm memory to represent the operating condition |
| (not the tiny 0.01-init default, which would produce near-uniform scores |
| and thus lower sparsity by design). |
| """ |
| D, N = 64, 1024 |
|
|
| from hydra.engram import GPUEngram |
| m = GPUEngram(d_model=D, n_columns=N) |
| |
| with torch.no_grad(): |
| m.memory.data = torch.nn.functional.normalize( |
| torch.randn(N, D), dim=-1 |
| ) |
| m.eval() |
|
|
| x = torch.randn(4, 32, D) |
| token_ids = torch.randint(0, 500, (4, 32)) |
|
|
| |
| with torch.no_grad(): |
| scores = x @ m.memory.T |
| try: |
| from entmax import entmax15 |
| weights = entmax15(scores, dim=-1) |
| except ImportError: |
| |
| k = 32 |
| topk_vals, topk_idx = scores.topk(k, dim=-1) |
| topk_w = torch.softmax(topk_vals, dim=-1) |
| weights = torch.zeros_like(scores) |
| weights.scatter_(-1, topk_idx, topk_w) |
|
|
| zero_fraction = (weights == 0).float().mean().item() |
| assert zero_fraction >= 0.95, ( |
| f"Expected >= 95% sparsity in attention weights, got {zero_fraction:.3f}" |
| ) |
|
|
|
|
| |
| |
| |
|
|
| def test_no_nan_on_zero_input(): |
| """All-zero input must produce a finite output (no NaN/Inf from entmax).""" |
| D, N = 64, 256 |
| m = _make_engram(d_model=D, n_columns=N) |
| m.eval() |
|
|
| x = torch.zeros(1, 8, D) |
| token_ids = torch.zeros(1, 8, dtype=torch.long) |
| out, hit_rate = m(x, token_ids) |
|
|
| assert torch.isfinite(out).all(), "Output contains NaN or Inf on zero input" |
| assert torch.isfinite(hit_rate), "hit_rate is NaN or Inf on zero input" |
|
|
|
|
| |
| |
| |
|
|
| def test_scales_to_32k(): |
| """n_columns=32768 must run on CPU without OOM and return correct shape.""" |
| D, N = 128, 32768 |
| from hydra.engram import GPUEngram |
| m = GPUEngram(d_model=D, n_columns=N) |
| m.eval() |
|
|
| x = torch.randn(1, 64, D) |
| token_ids = torch.randint(0, 1000, (1, 64)) |
| out, hit_rate = m(x, token_ids) |
|
|
| assert out.shape == (1, 64, D), f"Expected (1, 64, {D}), got {out.shape}" |
| assert torch.isfinite(out).all(), "Output contains NaN/Inf at n_columns=32768" |
|
|
|
|
| |
| |
| |
|
|
| def test_hebbian_off_by_default(): |
| """With default hebbian_boost=False, memory.data is unchanged after train forward.""" |
| D, N = 32, 64 |
| m = _make_engram(d_model=D, n_columns=N, hebbian_boost=False) |
| m.train() |
|
|
| before = m.memory.data.clone() |
| x = torch.randn(2, 4, D) |
| token_ids = torch.randint(0, 50, (2, 4)) |
| m(x, token_ids) |
| after = m.memory.data |
|
|
| assert torch.equal(before, after), ( |
| "memory.data was mutated during forward but hebbian_boost=False" |
| ) |
|
|
|
|
| def test_hebbian_on_updates_memory(): |
| """With hebbian_boost=True, memory.data changes after train forward.""" |
| D, N = 32, 64 |
| from hydra.engram import GPUEngram |
| m = GPUEngram(d_model=D, n_columns=N, hebbian_boost=True) |
| m.train() |
|
|
| before = m.memory.data.clone() |
| x = torch.randn(2, 4, D) |
| token_ids = torch.randint(0, 50, (2, 4)) |
| m(x, token_ids) |
| after = m.memory.data |
|
|
| assert not torch.equal(before, after), ( |
| "memory.data was NOT mutated during forward but hebbian_boost=True" |
| ) |
|
|