stanno / stanno.py
oldman-dev's picture
Up-to-date with original repo
8f0d906 verified
"""STANNO-style proof of concept
This module implements a very simple Self-Training Artificial Neural Network Object (STANNO)
loosely inspired by Thaler's description: two neural networks, one of which trains the other,
optionally folded into a single object.[cite:1][cite:3]
Design choices:
- TraineeNet: a small multilayer perceptron (MLP) that learns a supervised mapping.
- Trainer: training logic embedded inside STANNO using standard gradient descent.
Conceptually this plays the role of the "trainer" network described in the literature,
but here it is implemented as explicit code for simplicity.
Features included for experimentation:
- Supervised training on a toy dataset (e.g., y = sin(x)).
- "Dreaming": run the trained net on a fixed or random latent input with inputs partially
or totally "blinded" (set to zero or constant) to observe internal dynamics.
- Noise injection: add Gaussian noise with adjustable standard deviation to all weights,
to explore how output complexity changes with noise level (from "stupidity" to chaos).
- Lesioning: randomly zero out a fraction of weights to mimic progressive "death" of
connections and observe degradation ("tunnel vision").[cite:2]
The goal is not to reproduce the original spreadsheet implementation, but to give a
simple, hackable playground in modern Python/NumPy that you can extend (including
replacing the hard-coded trainer by a learned meta-network if desired).
"""
from __future__ import annotations
import numpy as np
from dataclasses import dataclass
from typing import Tuple, Callable
@dataclass
class TraineeNet:
"""Simple 2-layer MLP (input -> hidden -> output).
This is the network that will be trained by the STANNO object.
"""
input_dim: int
hidden_dim: int
output_dim: int
def __post_init__(self) -> None:
rng = np.random.default_rng()
# Xavier-like initialization
self.W1 = rng.normal(0.0, 1.0 / np.sqrt(self.input_dim), (self.input_dim, self.hidden_dim))
self.b1 = np.zeros((1, self.hidden_dim))
self.W2 = rng.normal(0.0, 1.0 / np.sqrt(self.hidden_dim), (self.hidden_dim, self.output_dim))
self.b2 = np.zeros((1, self.output_dim))
def parameters(self):
return [self.W1, self.b1, self.W2, self.b2]
def forward(self, x: np.ndarray) -> Tuple[np.ndarray, dict]:
"""Forward pass returning output and cache for backprop."""
z1 = x @ self.W1 + self.b1
a1 = np.tanh(z1)
z2 = a1 @ self.W2 + self.b2
y = z2 # regression; for classification you could add softmax
cache = {"x": x, "z1": z1, "a1": a1, "z2": z2}
return y, cache
def apply_parameter_noise(self, sigma: float, rng: np.random.Generator | None = None) -> None:
"""Add Gaussian noise with std sigma to all parameters in-place."""
if sigma <= 0:
return
if rng is None:
rng = np.random.default_rng()
for p in self.parameters():
p += rng.normal(0.0, sigma, p.shape)
def lesion(self, fraction: float, rng: np.random.Generator | None = None) -> None:
"""Randomly zero out a fraction of weights (simulated neuron/connection death).
fraction in [0, 1]. Only affects W1 and W2; biases remain.
"""
fraction = float(np.clip(fraction, 0.0, 1.0))
if fraction <= 0:
return
if rng is None:
rng = np.random.default_rng()
for W in (self.W1, self.W2):
mask = rng.random(W.shape) < fraction
W[mask] = 0.0
class STANNO:
"""Self-Training Neural Network Object (STANNO-style).
Encapsula:
- Una red entrenable (TraineeNet).
- Un algoritmo de entrenamiento interno (gradient descent) que actúa como
"trainer" y actualiza los pesos a partir de ejemplos.
Esto sigue el espíritu de los STANNO descritos por Thaler: un objeto que
contiene la red y su mecanismo de entrenamiento, con capacidad de seguir
aprendiendo en línea.[cite:1][cite:3]
"""
def __init__(
self,
input_dim: int,
hidden_dim: int,
output_dim: int,
learning_rate: float = 1e-2,
) -> None:
self.net = TraineeNet(input_dim, hidden_dim, output_dim)
self.learning_rate = learning_rate
# ---------------------- Core training logic ----------------------
def _loss_and_grads(self, x: np.ndarray, y_true: np.ndarray) -> Tuple[float, list]:
"""Compute MSE loss and gradients via backprop for one batch."""
y_pred, cache = self.net.forward(x)
# Mean squared error
diff = y_pred - y_true
loss = float(np.mean(diff ** 2))
# Backprop
batch_size = x.shape[0]
dL_dy = (2.0 / batch_size) * diff # dL/dy
# Layer 2
a1 = cache["a1"]
dL_dW2 = a1.T @ dL_dy
dL_db2 = np.sum(dL_dy, axis=0, keepdims=True)
# Through tanh
dz2 = dL_dy @ self.net.W2.T
da1 = dz2
dz1 = da1 * (1.0 - np.tanh(cache["z1"]) ** 2)
# Layer 1
x_batch = cache["x"]
dL_dW1 = x_batch.T @ dz1
dL_db1 = np.sum(dz1, axis=0, keepdims=True)
grads = [dL_dW1, dL_db1, dL_dW2, dL_db2]
return loss, grads
def trainer_step(self, x: np.ndarray, y_true: np.ndarray) -> float:
"""One training step of the internal trainer over a mini-batch.
Conceptualmente, esto es el "trainer network" que ajusta pesos del
TraineeNet. Aquí se implementa como gradiente descendente directo.
"""
loss, grads = self._loss_and_grads(x, y_true)
for param, grad in zip(self.net.parameters(), grads):
param -= self.learning_rate * grad
return loss
def fit(
self,
x: np.ndarray,
y: np.ndarray,
epochs: int = 1000,
batch_size: int = 32,
shuffle: bool = True,
callback: Callable[[int, float], None] | None = None,
) -> None:
"""Train on a dataset using internal trainer.
Args:
x: shape (N, input_dim)
y: shape (N, output_dim)
epochs: number of passes over the dataset
batch_size: mini-batch size
shuffle: whether to shuffle each epoch
callback: optional function(epoch, loss) for logging
"""
N = x.shape[0]
rng = np.random.default_rng()
for epoch in range(epochs):
idx = np.arange(N)
if shuffle:
rng.shuffle(idx)
x_shuf = x[idx]
y_shuf = y[idx]
losses = []
for start in range(0, N, batch_size):
end = start + batch_size
xb = x_shuf[start:end]
yb = y_shuf[start:end]
loss = self.trainer_step(xb, yb)
losses.append(loss)
mean_loss = float(np.mean(losses))
if callback is not None:
callback(epoch, mean_loss)
# ---------------------- Inference & "dreaming" ----------------------
def predict(self, x: np.ndarray) -> np.ndarray:
y, _ = self.net.forward(x)
return y
def dream(
self,
num_steps: int = 128,
input_seed: np.ndarray | None = None,
noise_sigma: float = 0.0,
blind_inputs: bool = False,
rng: np.random.Generator | None = None,
) -> np.ndarray:
"""Generate a sequence of outputs by driving the net with a simple or blind input.
Args:
num_steps: length of the sequence to generate.
input_seed: initial input vector; if None, uses zeros.
noise_sigma: amount of noise to add to weights *once* before dreaming.
blind_inputs: if True, inputs are forced to zero every step.
rng: optional RNG.
Returns:
Array of generated outputs of shape (num_steps, output_dim).
"""
if rng is None:
rng = np.random.default_rng()
# Work on a copy so as not to permanently corrupt the trained net
shadow = TraineeNet(self.net.input_dim, self.net.hidden_dim, self.net.output_dim)
shadow.W1 = self.net.W1.copy()
shadow.b1 = self.net.b1.copy()
shadow.W2 = self.net.W2.copy()
shadow.b2 = self.net.b2.copy()
shadow.apply_parameter_noise(noise_sigma, rng=rng)
if input_seed is None:
x = np.zeros((1, self.net.input_dim))
else:
x = input_seed.reshape(1, -1)
outputs = []
for _ in range(num_steps):
if blind_inputs:
x_step = np.zeros_like(x)
else:
x_step = x
y, _ = shadow.forward(x_step)
outputs.append(y.copy())
# Simple feedback: feed output (or part of él) as next input
# This makes the sequence sensitive to internal weights.
if self.net.output_dim == self.net.input_dim:
x = y
else:
# Project or tile to match input dim
x = np.repeat(y, self.net.input_dim // self.net.output_dim + 1, axis=1)[
:, : self.net.input_dim
]
return np.concatenate(outputs, axis=0)
# ---------------------- Demo utilities ----------------------
def make_sin_dataset(n_samples: int = 256) -> Tuple[np.ndarray, np.ndarray]:
"""Simple 1D regression dataset: y = sin(x) on [0, 2π]."""
rng = np.random.default_rng()
x = rng.uniform(0.0, 2.0 * np.pi, size=(n_samples, 1))
y = np.sin(x)
return x, y
def demo_train_and_dream() -> None:
    """Train a STANNO on sin(x) and then explore noise/lesion effects.

    Run this module as a script to see the numeric output. The function
    only prints; it returns nothing.
    """
    x, y = make_sin_dataset(512)
    stanno = STANNO(input_dim=1, hidden_dim=32, output_dim=1, learning_rate=5e-3)

    def log_progress(epoch: int, loss: float) -> None:
        # Report only every 100th epoch to keep the output short
        if (epoch + 1) % 100 == 0:
            print(f"Epoch {epoch:4d} loss={loss:.5f}")

    print("Training STANNO on y = sin(x)...")
    stanno.fit(
        x,
        y,
        epochs=500,
        batch_size=64,
        callback=log_progress,
    )

    # Evaluate basic fit on an evenly spaced grid
    xs = np.linspace(0, 2 * np.pi, 16).reshape(-1, 1)
    preds = stanno.predict(xs)
    print("\nSample predictions after training:")
    for xi, yi, yi_hat in zip(xs.flatten(), np.sin(xs).flatten(), preds.flatten()):
        print(f"x={xi:5.2f} sin(x)={yi: .3f} pred={yi_hat: .3f}")

    # Dreaming with different noise levels
    for sigma in [0.0, 0.05, 0.2, 0.5]:
        seq = stanno.dream(num_steps=32, noise_sigma=sigma, blind_inputs=True)
        print(f"\nDreaming with noise_sigma={sigma} (first 10 outputs):")
        print(np.round(seq[:10].flatten(), 3))

    # Lesion experiment
    print("\nLesioning 70% of weights and evaluating error on test points...")
    # Backup parameters so the trained weights survive the experiment
    backup = [p.copy() for p in stanno.net.parameters()]
    try:
        stanno.net.lesion(fraction=0.7)
        preds_lesioned = stanno.predict(xs)
        mse_lesioned = float(np.mean((preds_lesioned - np.sin(xs)) ** 2))
        print(f"MSE after lesioning 70% of weights: {mse_lesioned:.4f}")
    finally:
        # Restore in place, even if the evaluation above failed
        for param, saved in zip(stanno.net.parameters(), backup):
            param[...] = saved
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    demo_train_and_dream()