"""Tests for the from-scratch network. The important one is `test_gradient_check`: it verifies the hand-written backward passes against numerical finite-difference gradients. If the chain rule were wired up wrong, this test would catch it. This is the "build it AND prove it" guarantee. """ from __future__ import annotations import numpy as np from nn.layers import softmax_cross_entropy from nn.model import MLP, Adam def _loss_only(model: MLP, x, y) -> float: """Forward pass + loss, without touching gradients.""" logits = model.forward(x) loss, _ = softmax_cross_entropy(logits, y) return loss def test_softmax_cross_entropy_uniform(): # Uniform logits over C classes -> loss should equal ln(C). logits = np.zeros((4, 10)) y = np.array([0, 1, 2, 3]) loss, dlogits = softmax_cross_entropy(logits, y) assert abs(loss - np.log(10)) < 1e-6 assert dlogits.shape == logits.shape def test_forward_shape(): model = MLP(sizes=(784, 64, 32, 10), seed=1) x = np.random.default_rng(0).standard_normal((8, 784)).astype(np.float32) assert model.forward(x).shape == (8, 10) assert model.predict(x).shape == (8,) probs = model.probabilities(x) assert np.allclose(probs.sum(axis=1), 1.0, atol=1e-6) def test_gradient_check(): """Analytic gradients must match finite differences to high precision.""" rng = np.random.default_rng(42) model = MLP(sizes=(6, 5, 4, 3), seed=2) x = rng.standard_normal((4, 6)) y = rng.integers(0, 3, size=4) # Analytic gradients (snapshot them; FD will perturb params in place afterwards). model.loss_and_grad(x, y) analytic = [g.copy() for _, g in model.params_and_grads()] eps = 1e-5 for idx, (p, _) in enumerate(model.params_and_grads()): flat = p.ravel() ga = analytic[idx].ravel() # Check a handful of random coordinates per parameter tensor. coords = rng.choice(flat.size, size=min(5, flat.size), replace=False) for c in coords: orig = flat[c] flat[c] = orig + eps lp = _loss_only(model, x, y) flat[c] = orig - eps lm = _loss_only(model, x, y) flat[c] = orig numeric = (lp - lm) / (2 * eps) denom = max(1e-8, abs(numeric) + abs(ga[c])) rel_err = abs(numeric - ga[c]) / denom assert rel_err < 1e-4, f"grad mismatch at param {idx} coord {c}: {rel_err:.2e}" def test_overfit_tiny_batch(): """A tiny batch should be driven to near-zero loss — proves the loop learns.""" rng = np.random.default_rng(0) model = MLP(sizes=(20, 32, 16, 4), seed=0) opt = Adam(model, lr=1e-2) x = rng.standard_normal((8, 20)) y = rng.integers(0, 4, size=8) first = model.loss_and_grad(x, y) for _ in range(300): model.loss_and_grad(x, y) opt.step() last = _loss_only(model, x, y) assert last < 0.05, f"expected near-zero loss, got {last:.4f} (started {first:.4f})" assert (model.predict(x) == y).all()