Spaces:
Sleeping
Sleeping
| """Layers with hand-written forward and backward passes. | |
| Each layer caches what it needs on the forward pass and returns input-gradients | |
| on the backward pass, plus parameter-gradients where it has parameters. There is | |
| no automatic differentiation here on purpose: the point is to show the chain rule | |
| applied by hand, then prove it correct with a numerical gradient check (see tests). | |
| """ | |
| from __future__ import annotations | |
| import numpy as np | |
class Linear:
    """Fully-connected layer: ``y = x @ W + b``.

    ``W`` has shape ``(n_in, n_out)`` and ``b`` has shape ``(n_out,)``.
    ``forward`` caches its input so that ``backward`` can form the parameter
    gradients; ``backward`` stores them in ``dW`` / ``db`` and returns the
    gradient with respect to the input.

    The input may be a single sample of shape ``(n_in,)``, a batch of shape
    ``(N, n_in)``, or carry any number of leading axes ``(..., n_in)``;
    parameter gradients are summed over every leading axis.
    """

    def __init__(self, n_in: int, n_out: int, rng: np.random.Generator):
        """Create the layer with He-initialised weights and a zero bias.

        Args:
            n_in: Size of the last axis of the input.
            n_out: Size of the last axis of the output.
            rng: NumPy random generator used to draw the initial weights.
        """
        # He initialisation keeps activations from vanishing/exploding through ReLU.
        self.W = rng.standard_normal((n_in, n_out)) * np.sqrt(2.0 / n_in)
        self.b = np.zeros(n_out)
        # Gradients start at zero and are overwritten by every backward() call.
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        # Input cached by forward(); None until forward() has been called.
        self._x = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Return ``x @ W + b`` and cache ``x`` for the backward pass."""
        self._x = np.asarray(x)
        return x @ self.W + self.b

    def backward(self, dy: np.ndarray) -> np.ndarray:
        """Store ``dW`` / ``db`` and return the gradient w.r.t. the input.

        Args:
            dy: Gradient of the loss w.r.t. this layer's output; same shape
                as the value returned by the preceding ``forward`` call.

        Returns:
            Gradient of the loss w.r.t. the cached input (same shape as it).

        Raises:
            RuntimeError: If called before ``forward``.
        """
        if self._x is None:
            raise RuntimeError("Linear.backward() called before forward()")
        n_in, n_out = self.W.shape
        # Collapse all leading axes into one "batch" axis so the formulas below
        # hold for 1-D, 2-D and N-D inputs alike (a no-op for 2-D batches).
        x_rows = self._x.reshape(-1, n_in)
        dy_rows = dy.reshape(-1, n_out)
        # dL/dW = x^T · dy ; dL/db = sum over the batch ; dL/dx = dy · W^T
        self.dW = x_rows.T @ dy_rows
        self.db = dy_rows.sum(axis=0)
        return dy @ self.W.T

    def params_and_grads(self):
        """Yield ``(parameter, gradient)`` pairs: weights first, then bias."""
        yield self.W, self.dW
        yield self.b, self.db
class ReLU:
    """Rectified linear unit: passes positives through, zeros the rest."""

    def __init__(self):
        # Boolean mask of positive inputs, cached by forward(); None until then.
        self._mask = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Return ``max(x, 0)`` element-wise and cache where ``x`` was positive."""
        self._mask = x > 0
        # np.maximum rather than `x * mask`: multiplying would turn -inf into
        # NaN (-inf * 0), whereas ReLU(-inf) must be 0. NaN inputs still
        # propagate as NaN.
        return np.maximum(x, 0)

    def backward(self, dy: np.ndarray) -> np.ndarray:
        """Return the input gradient for upstream gradient ``dy``.

        Raises:
            RuntimeError: If called before ``forward``.
        """
        if self._mask is None:
            raise RuntimeError("ReLU.backward() called before forward()")
        # Gradient flows only where the input was positive.
        return dy * self._mask

    def params_and_grads(self):
        """Return an empty iterator: ReLU has no trainable parameters."""
        return iter(())
def softmax_cross_entropy(logits: np.ndarray, labels: np.ndarray):
    """Combined softmax + cross-entropy.

    Returns (loss, dlogits). Done together for numerical stability: subtracting the
    row-max before exp avoids overflow, and the gradient simplifies to (p - y) / N.

    The loss is computed from the log-softmax ``shifted - log(sum(exp(shifted)))``
    rather than ``log(probs + eps)``. The row-max shift guarantees the sum is at
    least 1, so the log is always finite and no epsilon is needed; the loss is
    therefore never negative and stays consistent with the returned gradient.

    Args:
        logits: Array of shape (N, C) with one row of class scores per sample.
        labels: Integer class index per row, shape (N,).

    Returns:
        Tuple ``(loss, dlogits)``: the mean cross-entropy over the N rows, and
        the gradient of that mean loss w.r.t. ``logits`` (same shape as it).
    """
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    n = logits.shape[0]
    rows = np.arange(n)

    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    # Each row contains exp(0) == 1 at its maximum, so sum_exp >= 1.
    sum_exp = exp_shifted.sum(axis=1, keepdims=True)
    probs = exp_shifted / sum_exp

    log_probs = shifted - np.log(sum_exp)
    loss = (-log_probs[rows, labels]).mean()

    # d(mean loss)/d(logits) = (softmax - one_hot) / N
    dlogits = probs.copy()
    dlogits[rows, labels] -= 1.0
    dlogits /= n
    return loss, dlogits