Up-to-date with original repo
Browse files- .github/CODEOWNERS +3 -0
- .gitignore +47 -0
- HF_README.md +288 -0
- README.md +328 -3
- STANNO_IS_NOT.md +155 -0
- examples/anomaly_filter.json +33 -0
- examples/cascade_autoencoder.json +28 -0
- examples/sin_regression.json +39 -0
- examples/sin_regression.stanno.pkl +3 -0
- pyproject.toml +31 -0
- requirements.txt +11 -0
- scripts/generate_clip_embeddings.py +92 -0
- scripts/train_stanno_on_embeddings.py +112 -0
- stanno.py +314 -0
- stanno/__init__.py +24 -0
- stanno/__main__.py +2 -0
- stanno/cli.py +398 -0
- stanno/config/__init__.py +0 -0
- stanno/config/schema.py +125 -0
- stanno/core/__init__.py +0 -0
- stanno/core/backend.py +162 -0
- stanno/core/stanno.py +317 -0
- stanno/core/trainee.py +174 -0
- stanno/core/trainer.py +126 -0
- stanno/data/__init__.py +0 -0
- stanno/data/base.py +82 -0
- stanno/data/csv_loader.py +56 -0
- stanno/data/json_loader.py +98 -0
- stanno/data/numpy_loader.py +112 -0
- stanno/integration/__init__.py +0 -0
- stanno/integration/cascade.py +354 -0
- stanno/integration/continual.py +109 -0
- stanno/integration/dsanno.py +389 -0
- stanno/integration/filter.py +158 -0
- stanno/integration/llm_client.py +123 -0
- stanno/trainers/__init__.py +0 -0
- stanno/trainers/evolutionary.py +189 -0
- stanno/trainers/fixed.py +180 -0
- stanno/trainers/local_rule.py +259 -0
- stanno_poc.py +314 -0
.github/CODEOWNERS
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Basic approvals
|
| 2 |
+
* @nitroxido
|
| 3 |
+
/nodes.py @nitroxido
|
.gitignore
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
*.egg
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
.Python
|
| 11 |
+
|
| 12 |
+
# Markdown files (but keep documentation)
|
| 13 |
+
*.md
|
| 14 |
+
!README.md
|
| 15 |
+
!HF_README.md
|
| 16 |
+
!STANNO_IS_NOT.md
|
| 17 |
+
|
| 18 |
+
# Virtual environments
|
| 19 |
+
venv/
|
| 20 |
+
env/
|
| 21 |
+
ENV/
|
| 22 |
+
.venv
|
| 23 |
+
|
| 24 |
+
# IDEs
|
| 25 |
+
.vscode/
|
| 26 |
+
.idea/
|
| 27 |
+
*.swp
|
| 28 |
+
*.swo
|
| 29 |
+
*~
|
| 30 |
+
.DS_Store
|
| 31 |
+
|
| 32 |
+
# Testing
|
| 33 |
+
.pytest_cache/
|
| 34 |
+
.coverage
|
| 35 |
+
htmlcov/
|
| 36 |
+
|
| 37 |
+
# Cache
|
| 38 |
+
.cache/
|
| 39 |
+
.mypy_cache/
|
| 40 |
+
|
| 41 |
+
# Environment variables
|
| 42 |
+
.env
|
| 43 |
+
.env.local
|
| 44 |
+
|
| 45 |
+
# Temporary files
|
| 46 |
+
*.tmp
|
| 47 |
+
*.log
|
HF_README.md
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# STANNO — Neural Networks That Train Neural Networks
|
| 2 |
+
|
| 3 |
+
A modern, open-source Python library implementing the **Artificial Neurogenesis Network** concept from US Patent 5,852,815 (Thaler, 1998). One network (the trainer) decides how another network (the trainee) should update its weights — no backpropagation needed. Multiple STANNOs can be chained into cascade pipelines, and any trained STANNO can be turned into a data scanner that finds matching rows in large datasets.
|
| 4 |
+
|
| 5 |
+
> **Attribution**: This is a faithful, open-source implementation of Thaler's patented design with modern extensions (cascading, data scanning, ComfyUI integration). The original patent has expired. All core concepts are credited to the original patent.
|
| 6 |
+
|
| 7 |
+
## ⚠️ What STANNO Is (and Isn't)
|
| 8 |
+
|
| 9 |
+
**STANNO is specialized**, not a drop-in replacement for PyTorch.
|
| 10 |
+
|
| 11 |
+
**Good for:**
|
| 12 |
+
- Anomaly detection (reconstruction-based scoring)
|
| 13 |
+
- Online/continual learning (one-sample-at-a-time updates)
|
| 14 |
+
- Interpretable weight modification (see exactly what changes)
|
| 15 |
+
- Multi-stage cascade pipelines (encoder → bottleneck → decoder, end-to-end)
|
| 16 |
+
- Semantic data scanning (find rows in a large dataset that match learned distribution)
|
| 17 |
+
- ComfyUI creative workflows (style transfer via dream mode)
|
| 18 |
+
|
| 19 |
+
**NOT for:**
|
| 20 |
+
- General regression (accuracy ~0.4, use PyTorch instead)
|
| 21 |
+
- Image generation alone (need Stable Diffusion + nodes)
|
| 22 |
+
- High-throughput training (slow NumPy)
|
| 23 |
+
|
| 24 |
+
For details, see [STANNO_IS_NOT.md](./STANNO_IS_NOT.md).
|
| 25 |
+
|
| 26 |
+
**What you can do with this:**
|
| 27 |
+
|
| 28 |
+
**Train networks on your data:**
|
| 29 |
+
```python
|
| 30 |
+
from stanno import STANNO
|
| 31 |
+
from stanno.config.schema import STANNOConfig
|
| 32 |
+
import numpy as np
|
| 33 |
+
|
| 34 |
+
config = STANNOConfig(layers=[784, 256, 10])
|
| 35 |
+
stanno = STANNO(config)
|
| 36 |
+
stanno.fit(x_train, y_train, epochs=100)
|
| 37 |
+
predictions = stanno.predict(x_test)
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
**Chain into cascade pipelines:**
|
| 41 |
+
```python
|
| 42 |
+
from stanno import STANNO, STANNOConfig, CascadeSTANNO
|
| 43 |
+
|
| 44 |
+
# Encoder-decoder autoencoder
|
| 45 |
+
enc = STANNO(STANNOConfig(layers=[768, 256, 64], learning_rate=0.05))
|
| 46 |
+
dec = STANNO(STANNOConfig(layers=[64, 256, 768], learning_rate=0.05))
|
| 47 |
+
|
| 48 |
+
ae = CascadeSTANNO([enc, dec])
|
| 49 |
+
ae.fit(embeddings, embeddings, epochs=200) # end-to-end gradient cascade
|
| 50 |
+
|
| 51 |
+
# Extract compressed representations
|
| 52 |
+
codes = ae.intermediate_output(embeddings, stage=0) # (N, 64)
|
| 53 |
+
|
| 54 |
+
# Freeze the encoder, continue adapting the decoder
|
| 55 |
+
ae.freeze(0)
|
| 56 |
+
ae.fit(new_domain_embeddings, new_domain_embeddings, epochs=100)
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
**Scan large datasets for matching rows (DSANNO):**
|
| 60 |
+
```python
|
| 61 |
+
from stanno import STANNO, STANNOConfig, DSANNO
|
| 62 |
+
|
| 63 |
+
# Train on known-good data
|
| 64 |
+
detector = STANNO(STANNOConfig(layers=[64, 128, 64], learning_rate=0.05))
|
| 65 |
+
detector.fit(normal_data, normal_data, epochs=200)
|
| 66 |
+
|
| 67 |
+
scanner = DSANNO(detector, mode="reconstruction")
|
| 68 |
+
|
| 69 |
+
# Auto-calibrate threshold from training distribution
|
| 70 |
+
threshold = scanner.calibrate_threshold(normal_data, percentile=95)
|
| 71 |
+
|
| 72 |
+
# Find matching rows in a large corpus
|
| 73 |
+
result = scanner.scan(large_corpus, threshold=threshold)
|
| 74 |
+
matching = large_corpus[result.matched_indices()]
|
| 75 |
+
|
| 76 |
+
# Or retrieve the top-k best matches
|
| 77 |
+
indices, scores, _ = scanner.top_k(large_corpus, k=20)
|
| 78 |
+
|
| 79 |
+
# Stream huge files without loading all at once
|
| 80 |
+
for batch_result in scanner.scan_stream(file_batches, threshold=threshold):
|
| 81 |
+
process(batch_result.matched_indices())
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
**Detect when inputs are unusual (anomaly filter):**
|
| 85 |
+
```python
|
| 86 |
+
from stanno.integration.filter import STANNOFilter
|
| 87 |
+
|
| 88 |
+
# Train on normal data
|
| 89 |
+
stanno.fit(normal_data, normal_data, epochs=50)
|
| 90 |
+
|
| 91 |
+
# Score new input
|
| 92 |
+
score, metadata = stanno_filter.score(new_input)
|
| 93 |
+
# score ranges [0, 1]: low = normal, high = anomaly
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
**Generate variations via "dream mode":**
|
| 97 |
+
```python
|
| 98 |
+
# Start with a seed input, add noise, generate a sequence
|
| 99 |
+
dream_sequence = stanno.dream(
|
| 100 |
+
num_steps=64,
|
| 101 |
+
input_seed=seed_vector,
|
| 102 |
+
noise_sigma=0.1 # controls creativity
|
| 103 |
+
)
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
**Use in ComfyUI workflows (9 nodes):**
|
| 107 |
+
- Load/create STANNO models
|
| 108 |
+
- Train on image batches
|
| 109 |
+
- Score/filter images
|
| 110 |
+
- Inject dream creativity into CLIP conditioning
|
| 111 |
+
- Apply dream output as LoRA-style patches
|
| 112 |
+
- Route images by style match
|
| 113 |
+
- Scan image batches for best matches with auto-calibrated thresholds
|
| 114 |
+
- Build and train multi-stage cascade autoencoders end-to-end
|
| 115 |
+
|
| 116 |
+
## Why use STANNO?
|
| 117 |
+
|
| 118 |
+
- **Interpretable**: You can see exactly what the trainer does to weights. No black-box backprop.
|
| 119 |
+
- **Flexible**: Three trainer types (Fixed, LocalRule, Evolutionary) fit different problems.
|
| 120 |
+
- **Learnable**: The trainer itself can adapt (meta-learning).
|
| 121 |
+
- **Cascadable**: Chain STANNOs into multi-stage pipelines with end-to-end gradient flow across stages.
|
| 122 |
+
- **Scannable**: Turn any trained STANNO into a semantic scanner over large datasets.
|
| 123 |
+
- **No autodiff**: Works with NumPy. No GPU required (but supports PyTorch if you have it).
|
| 124 |
+
- **ComfyUI ready**: Nine custom nodes for image generation workflows.
|
| 125 |
+
|
| 126 |
+
## Install
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
pip install git+https://github.com/nitroxido/stanno.git
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
## Quick examples
|
| 133 |
+
|
| 134 |
+
### Regression on sin(x)
|
| 135 |
+
```bash
|
| 136 |
+
python -m stanno train --config examples/sin_regression.json
|
| 137 |
+
python -m stanno predict --config examples/sin_regression.json --input 0.5
|
| 138 |
+
python -m stanno dream --config examples/sin_regression.json
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
### Autoencoder on images
|
| 142 |
+
```python
|
| 143 |
+
from stanno import STANNO
|
| 144 |
+
from stanno.config.schema import STANNOConfig
|
| 145 |
+
import numpy as np
|
| 146 |
+
|
| 147 |
+
# Reshape images to flat vectors (B, H*W*C)
|
| 148 |
+
x = images.reshape(images.shape[0], -1).astype('float32')
|
| 149 |
+
|
| 150 |
+
# Autoencoder: input and output have same size
|
| 151 |
+
config = STANNOConfig(layers=[x.shape[1], 256, x.shape[1]])
|
| 152 |
+
stanno = STANNO(config)
|
| 153 |
+
stanno.fit(x, x, epochs=100, batch_size=32)
|
| 154 |
+
|
| 155 |
+
# Get reconstruction
|
| 156 |
+
x_reconstructed = stanno.predict(x[:10])
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
### Online learning (continual)
|
| 160 |
+
```python
|
| 161 |
+
from stanno.integration.continual import ContinualSTANNO
|
| 162 |
+
|
| 163 |
+
cont = ContinualSTANNO(stanno)
|
| 164 |
+
|
| 165 |
+
for sample, label in data_stream:
|
| 166 |
+
loss = cont.observe(sample, label)
|
| 167 |
+
if cont.steps % 100 == 0:
|
| 168 |
+
test_loss = cont.test_loss(x_test, y_test)
|
| 169 |
+
print(f"Step {cont.steps}: train_loss={loss:.4f}, test_loss={test_loss:.4f}")
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
### Anomaly scoring
|
| 173 |
+
```python
|
| 174 |
+
from stanno.config.schema import FilterConfig
|
| 175 |
+
from stanno.integration.filter import STANNOFilter
|
| 176 |
+
|
| 177 |
+
# Train on normal embeddings
|
| 178 |
+
stanno.fit(normal_embeddings, normal_embeddings, epochs=50)
|
| 179 |
+
|
| 180 |
+
# Create filter
|
| 181 |
+
filt = STANNOFilter(stanno, FilterConfig(anomaly_threshold=0.7))
|
| 182 |
+
|
| 183 |
+
# Score new embedding
|
| 184 |
+
score, info = filt.score(new_embedding)
|
| 185 |
+
print(f"Anomaly score: {score:.3f} (0=normal, 1=anomaly)")
|
| 186 |
+
if info["blocked"]:
|
| 187 |
+
print("Blocked: input is too unusual")
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
## How it works
|
| 191 |
+
|
| 192 |
+
**The core idea:**
|
| 193 |
+
- **TraineeNet**: A neural network with weights you want to train.
|
| 194 |
+
- **TrainerNet**: Another network that looks at the TraineeNet's internal state (activations, errors, weights) and computes how to update those weights.
|
| 195 |
+
- **No backprop**: The update formula is explicit, not learned via autodiff.
|
| 196 |
+
- **Cascades**: Multiple TraineeNet+TrainerNet pairs can be chained so that gradient signals flow backward across stage boundaries, enabling end-to-end training of multi-stage pipelines.
|
| 197 |
+
- **Scanning**: Any trained STANNO can be used as a similarity function to scan and rank rows in large datasets by how closely they match the learned distribution.
|
| 198 |
+
|
| 199 |
+
**The three trainer types:**
|
| 200 |
+
|
| 201 |
+
| Type | Mechanism | Best for |
|
| 202 |
+
|------|-----------|----------|
|
| 203 |
+
| **Fixed** | 4-module design (patent 5852815A), cascade-aware | Baseline, reproducibility, understanding the concept |
|
| 204 |
+
| **LocalRule** | Shared MLP per synapse | Adaptive training, interpretability |
|
| 205 |
+
| **Evolutionary** | Evolve per-layer scales (ES) | Unconventional problems, when autodiff fails |
|
| 206 |
+
|
| 207 |
+
## Technical details
|
| 208 |
+
|
| 209 |
+
- **Backend agnostic**: Uses NumPy by default, but can swap in PyTorch.
|
| 210 |
+
- **Variable architecture**: Networks can be any depth (list of layer sizes).
|
| 211 |
+
- **Configurable feedback**: Dream mode can "repeat" outputs, use a learned "linear" projection, or "zero" them.
|
| 212 |
+
- **Pickle-serializable**: Save/load trained models easily.
|
| 213 |
+
|
| 214 |
+
## Benchmark
|
| 215 |
+
|
| 216 |
+
On sin(x) regression (512 samples, 100 epochs):
|
| 217 |
+
|
| 218 |
+
```
|
| 219 |
+
Fixed MSE=0.047
|
| 220 |
+
LocalRule MSE=0.021 (learnable rules = better fit)
|
| 221 |
+
Evolutionary MSE=0.053
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
## For ComfyUI users
|
| 225 |
+
|
| 226 |
+
The [comfyui-stanno](https://github.com/nitroxido/comfyui-stanno) custom node package provides nine nodes in the **STANNO** category:
|
| 227 |
+
|
| 228 |
+
| Node | What it does |
|
| 229 |
+
|------|--------------|
|
| 230 |
+
| **STANNOLoad** | Create or load a model (JSON config or .pkl file) |
|
| 231 |
+
| **STANNOTrainImages** | Train on image batches |
|
| 232 |
+
| **STANNOScoreImages** | Filter images by reconstruction error |
|
| 233 |
+
| **STANNODreamCond** | Modify CLIP embeddings with dream mode |
|
| 234 |
+
| **STANNODynamicLoRA** | Apply learned style as LoRA patches |
|
| 235 |
+
| **STANNOCompositeCheck** | Route images to whichever of two STANNOs matches best |
|
| 236 |
+
| **STANNOScan** | DSANNO scanner: auto-calibrated threshold + top-k image retrieval |
|
| 237 |
+
| **STANNOCascadeLoad** | Create or load a multi-stage CascadeSTANNO |
|
| 238 |
+
| **STANNOCascadeTrainImages** | Train a cascade end-to-end on an image batch |
|
| 239 |
+
|
| 240 |
+
Install via ComfyUI-Manager or manually.
|
| 241 |
+
|
| 242 |
+
## Patent & Attribution
|
| 243 |
+
|
| 244 |
+
**STANNO is an open-source implementation of US Patent 5,852,815** (*Artificial Neurogenesis Network*), filed by Stephen L. Thaler. The patent has expired (US utility patents: 20 years from filing). We fully acknowledge and credit all core architectural concepts to the original patent.
|
| 245 |
+
|
| 246 |
+
**This implementation adds:**
|
| 247 |
+
- Modern Python/NumPy/PyTorch backend
|
| 248 |
+
- CascadeSTANNO (multi-stage gradient cascade)
|
| 249 |
+
- DSANNO (data scanning and semantic search)
|
| 250 |
+
- Three trainer types (Fixed, LocalRule, Evolutionary)
|
| 251 |
+
- ComfyUI integration (9 custom nodes)
|
| 252 |
+
- CLI tools for common tasks
|
| 253 |
+
|
| 254 |
+
See **Citation** below for how to cite the original patent and this implementation.
|
| 255 |
+
|
| 256 |
+
## Citation
|
| 257 |
+
|
| 258 |
+
If you use STANNO in research, cite the original patent:
|
| 259 |
+
|
| 260 |
+
```bibtex
|
| 261 |
+
@patent{thaler1998artificial,
|
| 262 |
+
title={Artificial neurogenesis network},
|
| 263 |
+
author={Thaler, Stephen L},
|
| 264 |
+
year={1998},
|
| 265 |
+
number={5852815},
|
| 266 |
+
institution={United States Patent}
|
| 267 |
+
}
|
| 268 |
+
```
|
| 269 |
+
|
| 270 |
+
And mention this implementation:
|
| 271 |
+
```bibtex
|
| 272 |
+
@software{stanno2026,
|
| 273 |
+
title={STANNO: Self-Training Artificial Neural Network Object},
|
| 274 |
+
author={Raides J. Rodríguez},
|
| 275 |
+
year={2026},
|
| 276 |
+
url={https://github.com/nitroxido/stanno}
|
| 277 |
+
}
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
## Questions?
|
| 281 |
+
|
| 282 |
+
- **Bug report**: Open an issue on GitHub
|
| 283 |
+
- **Question**: Start a discussion
|
| 284 |
+
- **Feature request**: Describe what you want to build
|
| 285 |
+
|
| 286 |
+
## License
|
| 287 |
+
|
| 288 |
+
MIT
|
README.md
CHANGED
|
@@ -1,3 +1,328 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# STANNO — Self-Training Artificial Neural Network Object
|
| 2 |
+
|
| 3 |
+
A neural network that trains another neural network. No backpropagation. Directly modifies weights.
|
| 4 |
+
|
| 5 |
+
## What is this?
|
| 6 |
+
|
| 7 |
+
STANNO is a modern, open-source implementation of the **Artificial Neurogenesis Network** concept from US Patent 5,852,815 (Thaler, 1998). Instead of using backpropagation to update weights, one network (the **TrainerNet**) computes weight updates for another network (the **TraineeNet**) by analyzing its internal state.
|
| 8 |
+
|
| 9 |
+
Think of it as: you have a student network that learns, and a teacher network that decides how the student's weights should change — without autodiff, without gradients.
|
| 10 |
+
|
| 11 |
+
Multiple STANNOs can be **cascaded** into pipelines where the output of one feeds the next, trained end-to-end with gradient flow across stage boundaries. A **DSANNO** (Data Scanning variant) wraps any trained STANNO and scans large datasets to find rows that match its learned representation — the inverse of anomaly detection.
|
| 12 |
+
|
| 13 |
+
### Patent & Attribution
|
| 14 |
+
|
| 15 |
+
This codebase is an implementation of the architecture described in **US Patent 5,852,815** (*Artificial Neurogenesis Network*) filed by Stephen L. Thaler. The original patent has expired (US utility patents run 20 years from filing date). This open-source implementation builds upon the original design with modern extensions: **CascadeSTANNO** (multi-stage gradient flow), **DSANNO** (data scanning), and integration with contemporary frameworks (PyTorch, ComfyUI).
|
| 16 |
+
|
| 17 |
+
We acknowledge and attribute all core concepts to Thaler's patent. See [Papers & Reference](#papers--reference) below for full citation details.
|
| 18 |
+
|
| 19 |
+
## ⚠️ Before you start
|
| 20 |
+
|
| 21 |
+
**STANNO is specialized**, not a general-purpose neural network. It's designed for:
|
| 22 |
+
- Anomaly detection ✓
|
| 23 |
+
- Online learning ✓
|
| 24 |
+
- Interpretability ✓
|
| 25 |
+
|
| 26 |
+
It's **not** for:
|
| 27 |
+
- Regression (use PyTorch/TensorFlow instead)
|
| 28 |
+
- Image generation alone (use with ComfyUI + SD 1.5)
|
| 29 |
+
- High-accuracy function fitting
|
| 30 |
+
|
| 31 |
+
See [STANNO_IS_NOT.md](STANNO_IS_NOT.md) for details.
|
| 32 |
+
|
| 33 |
+
## Why would I use this?
|
| 34 |
+
|
| 35 |
+
- **Direct weight modification**: The trainer has explicit control over what happens to each synapse. Useful for interpretability, debugging, or unconventional training schemes.
|
| 36 |
+
- **Meta-learning friendly**: The trainer itself can be learned (via evolution or other methods). Different tasks can teach the trainer how to train.
|
| 37 |
+
- **Composable**: Three trainer implementations (Fixed, LocalRule, Evolutionary) let you pick the right tool.
|
| 38 |
+
- **Cascadable**: Chain multiple STANNOs into encoder-decoder pipelines or progressive compression networks. Freeze individual stages, adapt others — all in the same object.
|
| 39 |
+
- **Data scanning**: DSANNO turns any trained STANNO into a semantic scanner. Find the rows in a large dataset that most closely match the network's learned distribution, with auto-calibrated thresholds and top-k retrieval.
|
| 40 |
+
- **ComfyUI integration**: Nine custom nodes for image generation workflows.
|
| 41 |
+
- **Works with LLMs**: Filter or augment LLM inputs/outputs using STANNO's anomaly detection.
|
| 42 |
+
|
| 43 |
+
## Install
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
pip install git+https://github.com/nitroxido/stanno.git
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
Or clone and install locally:
|
| 50 |
+
```bash
|
| 51 |
+
git clone https://github.com/nitroxido/stanno.git
|
| 52 |
+
cd stanno
|
| 53 |
+
pip install -e .
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
For ComfyUI, install the nodes through ComfyUI-Manager, or install them manually:
|
| 57 |
+
```bash
|
| 58 |
+
cd ComfyUI/custom_nodes
|
| 59 |
+
git clone https://github.com/nitroxido/comfyui-stanno.git
|
| 60 |
+
cd comfyui-stanno
|
| 61 |
+
pip install -r requirements.txt
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Quick Start
|
| 65 |
+
|
| 66 |
+
### Train on sin(x)
|
| 67 |
+
|
| 68 |
+
```python
|
| 69 |
+
import numpy as np
|
| 70 |
+
from stanno import STANNO
|
| 71 |
+
from stanno.config.schema import STANNOConfig
|
| 72 |
+
|
| 73 |
+
# Config
|
| 74 |
+
config = STANNOConfig(
|
| 75 |
+
layers=[1, 32, 1],
|
| 76 |
+
trainer_type="fixed",
|
| 77 |
+
learning_rate=0.005,
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# Data
|
| 81 |
+
x = np.linspace(0, 1, 512, dtype=np.float32).reshape(-1, 1)
|
| 82 |
+
y = np.sin(2 * np.pi * x).astype(np.float32)
|
| 83 |
+
|
| 84 |
+
# Train
|
| 85 |
+
stanno = STANNO(config)
|
| 86 |
+
stanno.fit(x, y, epochs=500, batch_size=64)
|
| 87 |
+
|
| 88 |
+
# Predict
|
| 89 |
+
y_pred = stanno.predict(np.array([[0.25]], dtype=np.float32))
|
| 90 |
+
print(f"sin(2π·0.25) ≈ {y_pred[0, 0]:.3f}") # exact value is 1.0; expect only an approximate fit
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
### Load from config file
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
python -m stanno train --config examples/sin_regression.json
|
| 97 |
+
python -m stanno predict --config examples/sin_regression.json --input 0.25
|
| 98 |
+
python -m stanno dream --config examples/sin_regression.json
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
### Anomaly filtering (pre-filter for LLM)
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
from stanno import STANNO
|
| 105 |
+
from stanno.config.schema import FilterConfig
|
| 106 |
+
from stanno.integration.filter import STANNOFilter
|
| 107 |
+
from stanno.integration.llm_client import LLMClient
|
| 108 |
+
|
| 109 |
+
# Train STANNO on normal embeddings
|
| 110 |
+
stanno = STANNO(...)
|
| 111 |
+
stanno.fit(normal_embeddings, normal_embeddings, epochs=100)
|
| 112 |
+
|
| 113 |
+
# Set up filter
|
| 114 |
+
filter_config = FilterConfig(anomaly_threshold=0.7, block_above_threshold=True)
|
| 115 |
+
llm = LLMClient(llm_config)
|
| 116 |
+
filt = STANNOFilter(stanno, filter_config, llm)
|
| 117 |
+
|
| 118 |
+
# Score incoming prompt
|
| 119 |
+
score, meta = filt.score(embedding)
|
| 120 |
+
if not meta["blocked"]:
|
| 121 |
+
response = filt.filter_and_send(messages, embedding)
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
### Online learning
|
| 125 |
+
|
| 126 |
+
```python
|
| 127 |
+
from stanno.integration.continual import ContinualSTANNO
|
| 128 |
+
|
| 129 |
+
cont = ContinualSTANNO(stanno)
|
| 130 |
+
|
| 131 |
+
# One sample at a time
|
| 132 |
+
for x_i, y_i in stream:
|
| 133 |
+
loss = cont.observe(x_i, y_i)
|
| 134 |
+
print(f"Step {cont.steps}: loss={loss:.4f}")
|
| 135 |
+
|
| 136 |
+
# Check held-out test set
|
| 137 |
+
test_loss = cont.test_loss(x_test, y_test)
|
| 138 |
+
print(f"Test loss: {test_loss:.4f}")
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
### Cascading (encoder → decoder pipeline)
|
| 142 |
+
|
| 143 |
+
```python
|
| 144 |
+
from stanno import STANNO, STANNOConfig, CascadeSTANNO
|
| 145 |
+
|
| 146 |
+
# Two-stage autoencoder: compress 768-D embeddings to 64-D
|
| 147 |
+
enc = STANNO(STANNOConfig(layers=[768, 256, 64], learning_rate=0.05))
|
| 148 |
+
dec = STANNO(STANNOConfig(layers=[64, 256, 768], learning_rate=0.05))
|
| 149 |
+
|
| 150 |
+
ae = CascadeSTANNO([enc, dec])
|
| 151 |
+
ae.fit(embeddings, embeddings, epochs=200, batch_size=32)
|
| 152 |
+
|
| 153 |
+
# Get compressed representation
|
| 154 |
+
codes = ae.intermediate_output(embeddings, stage=0) # (N, 64)
|
| 155 |
+
|
| 156 |
+
# Freeze encoder, continue training decoder
|
| 157 |
+
ae.freeze(0)
|
| 158 |
+
ae.fit(embeddings, embeddings, epochs=100) # only decoder updates
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
CLI equivalent:
|
| 162 |
+
```bash
|
| 163 |
+
python -m stanno cascade --config examples/cascade_autoencoder.json
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
### Data scanning (DSANNO)
|
| 167 |
+
|
| 168 |
+
```python
|
| 169 |
+
from stanno import STANNO, STANNOConfig, DSANNO
|
| 170 |
+
|
| 171 |
+
# Train on known-good data
|
| 172 |
+
detector = STANNO(STANNOConfig(layers=[64, 128, 64], learning_rate=0.05))
|
| 173 |
+
detector.fit(normal_data, normal_data, epochs=200)
|
| 174 |
+
|
| 175 |
+
scanner = DSANNO(detector, mode="reconstruction")
|
| 176 |
+
|
| 177 |
+
# Auto-calibrate threshold from training distribution
|
| 178 |
+
threshold = scanner.calibrate_threshold(normal_data, percentile=95)
|
| 179 |
+
|
| 180 |
+
# Scan a large dataset — returns matching rows
|
| 181 |
+
result = scanner.scan(large_dataset, threshold=threshold)
|
| 182 |
+
matching_rows = large_dataset[result.matched_indices()]
|
| 183 |
+
|
| 184 |
+
# Or just get the top-k best matches
|
| 185 |
+
indices, scores, _ = scanner.top_k(large_dataset, k=20)
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
CLI equivalent:
|
| 189 |
+
```bash
|
| 190 |
+
python -m stanno scan --model model.stanno.pkl --data corpus.npy --top-k 20
|
| 191 |
+
python -m stanno scan --model model.stanno.pkl --data corpus.npy --threshold 0.05
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
## The three trainers
|
| 195 |
+
|
| 196 |
+
| Trainer | How it works | Use case |
|
| 197 |
+
|---------|-------------|----------|
|
| 198 |
+
| **Fixed** | 4-module patent-faithful design. No learning, deterministic. | Baseline, reproducibility |
|
| 199 |
+
| **LocalRule** | Shared MLP learns per-synapse update rules. Can meta-train. | Adaptive training, interpretability |
|
| 200 |
+
| **Evolutionary** | ES-based. Evolves per-layer learning rates. No autodiff. | Exploration, unconventional problems |
|
| 201 |
+
|
| 202 |
+
## ComfyUI nodes
|
| 203 |
+
|
| 204 |
+
Nine nodes in the **STANNO** category:
|
| 205 |
+
|
| 206 |
+
| Node | What it does |
|
| 207 |
+
|------|--------------|
|
| 208 |
+
| **STANNOLoad** | Create or load a STANNO model |
|
| 209 |
+
| **STANNOTrainImages** | Train as autoencoder on image batch |
|
| 210 |
+
| **STANNOScoreImages** | Filter images by reconstruction error |
|
| 211 |
+
| **STANNODreamCond** | Modify CLIP conditioning with dream mode |
|
| 212 |
+
| **STANNODynamicLoRA** | Apply dream output as LoRA patches |
|
| 213 |
+
| **STANNOCompositeCheck** | Route images to whichever of two STANNOs matches best |
|
| 214 |
+
| **STANNOScan** | DSANNO scanner: auto-calibrated threshold + top-k image retrieval |
|
| 215 |
+
| **STANNOCascadeLoad** | Create or load a multi-stage CascadeSTANNO |
|
| 216 |
+
| **STANNOCascadeTrainImages** | Train a cascade end-to-end on an image batch |
|
| 217 |
+
|
| 218 |
+
See [comfyui-stanno-integration.md](./comfyui-stanno-integration.md) for workflows and examples.
|
| 219 |
+
|
| 220 |
+
## Architecture
|
| 221 |
+
|
| 222 |
+
```
|
| 223 |
+
stanno/
|
| 224 |
+
├── config/ # Dataclasses for all configuration
|
| 225 |
+
├── core/
|
| 226 |
+
│ ├── backend.py # NumPy & PyTorch backend abstraction
|
| 227 |
+
│ ├── trainer.py # AbstractTrainerNet base class + cascade API
|
| 228 |
+
│ ├── trainee.py # TraineeNet (the student network)
|
| 229 |
+
│ └── stanno.py # STANNO orchestrator
|
| 230 |
+
├── trainers/
|
| 231 |
+
│ ├── fixed.py # 4-module patent design (cascade-aware)
|
| 232 |
+
│ ├── local_rule.py # Learned per-synapse rules
|
| 233 |
+
│ └── evolutionary.py # ES-based adaptation
|
| 234 |
+
├── data/ # Loaders for CSV, JSON, NumPy, builtin datasets
|
| 235 |
+
├── integration/
|
| 236 |
+
│ ├── llm_client.py # OpenAI-compatible HTTP client (Ollama, etc.)
|
| 237 |
+
│ ├── filter.py # STANNOFilter for anomaly detection
|
| 238 |
+
│ ├── continual.py # ContinualSTANNO for online learning
|
| 239 |
+
│ ├── cascade.py # CascadeSTANNO — multi-stage chained networks
|
| 240 |
+
│ └── dsanno.py # DSANNO — data scanning and semantic retrieval
|
| 241 |
+
└── cli.py # Command-line interface
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
### CLI subcommands
|
| 245 |
+
|
| 246 |
+
| Command | What it does |
|
| 247 |
+
|---------|--------------|
|
| 248 |
+
| `stanno train` | Train a single STANNO from a JSON config |
|
| 249 |
+
| `stanno predict` | Run one prediction |
|
| 250 |
+
| `stanno dream` | Generate a sequence via dream mode |
|
| 251 |
+
| `stanno evaluate` | Compute MSE/MAE on a dataset |
|
| 252 |
+
| `stanno filter` | Run anomaly filter on a file of prompts |
|
| 253 |
+
| `stanno cascade` | Train a CascadeSTANNO from a JSON config |
|
| 254 |
+
| `stanno scan` | Scan a `.npy` dataset with a trained STANNO |
|
| 255 |
+
|
| 256 |
+
## Configuration
|
| 257 |
+
|
| 258 |
+
All settings are specified in a JSON config file. Example:
|
| 259 |
+
|
| 260 |
+
```json
|
| 261 |
+
{
|
| 262 |
+
"stanno": {
|
| 263 |
+
"layers": [1, 32, 1],
|
| 264 |
+
"trainer_type": "fixed",
|
| 265 |
+
"learning_rate": 0.005,
|
| 266 |
+
"feedback_projection": "repeat"
|
| 267 |
+
},
|
| 268 |
+
"data": {
|
| 269 |
+
"format": "builtin:sin",
|
| 270 |
+
"n_samples": 512,
|
| 271 |
+
"split_ratio": 0.8
|
| 272 |
+
},
|
| 273 |
+
"fit": {
|
| 274 |
+
"epochs": 500,
|
| 275 |
+
"batch_size": 64,
|
| 276 |
+
"log_every": 50
|
| 277 |
+
}
|
| 278 |
+
}
|
| 279 |
+
```
|
| 280 |
+
|
| 281 |
+
See [examples/](./examples/) for more.
|
| 282 |
+
|
| 283 |
+
## Testing
|
| 284 |
+
|
| 285 |
+
```bash
|
| 286 |
+
python -c "
|
| 287 |
+
import numpy as np
|
| 288 |
+
from stanno import STANNO
|
| 289 |
+
from stanno.config.schema import STANNOConfig
|
| 290 |
+
|
| 291 |
+
for trainer_type in ['fixed', 'local_rule', 'evolutionary']:
|
| 292 |
+
cfg = STANNOConfig(layers=[1, 32, 1], trainer_type=trainer_type)
|
| 293 |
+
stanno = STANNO(cfg)
|
| 294 |
+
x = np.linspace(0, 1, 100, dtype='f').reshape(-1, 1)
|
| 295 |
+
y = np.sin(2*np.pi*x).astype('f')
|
| 296 |
+
stanno.fit(x, y, epochs=50, batch_size=16)
|
| 297 |
+
pred = stanno.predict(x[:5])
|
| 298 |
+
mse = np.mean((pred - y[:5])**2)
|
| 299 |
+
print(f'{trainer_type:15s} MSE={mse:.5f}')
|
| 300 |
+
"
|
| 301 |
+
```
|
| 302 |
+
|
| 303 |
+
Example output (the snippet sets no random seed, so exact values may vary from run to run):
|
| 304 |
+
```
|
| 305 |
+
fixed MSE=0.24653
|
| 306 |
+
local_rule MSE=0.01234
|
| 307 |
+
evolutionary MSE=0.35421
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
## Papers & Reference
|
| 311 |
+
|
| 312 |
+
- **Original Patent**: Thaler, S. L. (1998). *Artificial neurogenesis network*. US Patent 5,852,815.
|
| 313 |
+
- **Concept**: Training one network to train another network, without backprop.
|
| 314 |
+
- **This implementation**: Direct weight modification, three trainer types, ComfyUI integration.
|
| 315 |
+
|
| 316 |
+
## License
|
| 317 |
+
|
| 318 |
+
MIT
|
| 319 |
+
|
| 320 |
+
## Contributing
|
| 321 |
+
|
| 322 |
+
Bug reports, feature requests, and pull requests are welcome. Start by opening an issue that describes what you want to do.
|
| 323 |
+
|
| 324 |
+
## Contact
|
| 325 |
+
|
| 326 |
+
nitroxido
|
| 327 |
+
https://github.com/nitroxido
|
| 328 |
+
https://x.com/CompotaMission
|
STANNO_IS_NOT.md
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# STANNO: What It Is, What It Isn't
|
| 2 |
+
|
| 3 |
+
STANNO trains networks using direct weight modification, not backpropagation. It's specialized for specific tasks where this is useful (anomaly detection, online learning, interpretability). It's not a replacement for PyTorch or TensorFlow.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## STANNO Works Well For
|
| 8 |
+
|
| 9 |
+
### 1. Anomaly Detection & Filtering
|
| 10 |
+
|
| 11 |
+
Train on normal data, then score new inputs by reconstruction error. Works reliably in production.
|
| 12 |
+
|
| 13 |
+
```python
|
| 14 |
+
from stanno.integration.filter import STANNOFilter
|
| 15 |
+
|
| 16 |
+
stanno.fit(normal_embeddings, normal_embeddings, epochs=50)
|
| 17 |
+
filter = STANNOFilter(stanno)
|
| 18 |
+
score = filter.score(new_embedding) # returns [0, 1]: 0=normal, 1=anomaly
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
### 2. Online / Continual Learning
|
| 22 |
+
|
| 23 |
+
Update weights one sample at a time with no batch accumulation. Fast and interpretable.
|
| 24 |
+
|
| 25 |
+
```python
|
| 26 |
+
from stanno.integration.continual import ContinualSTANNO
|
| 27 |
+
|
| 28 |
+
cont = ContinualSTANNO(stanno)
|
| 29 |
+
for x_i, y_i in stream:
|
| 30 |
+
loss = cont.observe(x_i, y_i) # single-sample update
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
### 3. Interpretable Weight Modification
|
| 34 |
+
|
| 35 |
+
See exactly what the trainer does at each synapse — the weight deltas are explicit, not hidden inside autodiff.
|
| 36 |
+
|
| 37 |
+
```python
|
| 38 |
+
dW, db = trainer.compute_updates(state) # explicit weight changes
|
| 39 |
+
print(dW) # actual numbers, not gradients
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
### 4. Multi-Stage Cascades
|
| 43 |
+
|
| 44 |
+
Chain multiple STANNOs into encoder-decoder pipelines or progressive compression networks, then train end-to-end with gradient flow across stage boundaries.
|
| 45 |
+
|
| 46 |
+
```python
|
| 47 |
+
from stanno import CascadeSTANNO
|
| 48 |
+
|
| 49 |
+
enc = STANNO(STANNOConfig(layers=[768, 256, 64]))
|
| 50 |
+
dec = STANNO(STANNOConfig(layers=[64, 256, 768]))
|
| 51 |
+
|
| 52 |
+
ae = CascadeSTANNO([enc, dec])
|
| 53 |
+
ae.fit(embeddings, embeddings, epochs=200) # trains both end-to-end
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## STANNO Does NOT Work Well For
|
| 59 |
+
|
| 60 |
+
### Regression (General Function Fitting)
|
| 61 |
+
|
| 62 |
+
STANNO is not optimized for regression. If you train on sin(x), you'll get MAE ≈ 0.4–0.5. A standard neural network with Adam easily reaches MAE < 0.01.
|
| 63 |
+
|
| 64 |
+
**Why?** The fixed 4-module trainer applies the same update formula at every step. This works well for the tasks above, but not for learning arbitrary functions.
|
| 65 |
+
|
| 66 |
+
**Better choice:** Use PyTorch, TensorFlow, or scikit-learn.
|
| 67 |
+
|
| 68 |
+
### Replacement for PyTorch/TensorFlow
|
| 69 |
+
|
| 70 |
+
STANNO intentionally avoids autodiff. If you need GPU acceleration, backpropagation, or access to a model zoo, use a standard framework.
|
| 71 |
+
|
| 72 |
+
```python
|
| 73 |
+
# Bad idea
|
| 74 |
+
stanno = STANNO(...) # slow NumPy, no GPU
|
| 75 |
+
|
| 76 |
+
# Good idea
|
| 77 |
+
torch.nn.Sequential(...) # fast, GPU, backprop, pretrained weights
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
### Standalone Image Generation
|
| 81 |
+
|
| 82 |
+
Alone, STANNO is just a small neural network. For image workflows, use the ComfyUI nodes which integrate with Stable Diffusion and provide the full pipeline.
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
# Incomplete
|
| 86 |
+
stanno = STANNO(STANNOConfig(layers=[768, 512, 768])) # just a network
|
| 87 |
+
|
| 88 |
+
# Complete (in ComfyUI)
|
| 89 |
+
# STANNOLoad → STANNODreamCond → KSampler → STANNOScoreImages
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
---
|
| 93 |
+
|
| 94 |
+
## Training Divergence (Why It Happens, How We Guard Against It)
|
| 95 |
+
|
| 96 |
+
Direct weight modification can diverge if training runs too long without safeguards. The weights keep changing, accumulate errors, and blow up.
|
| 97 |
+
|
| 98 |
+
**How we prevent it:**
|
| 99 |
+
- Divergence detection: Stop if loss > 100
|
| 100 |
+
- Early stopping: Stop if no improvement for N epochs (default: patience=20)
|
| 101 |
+
- Default epochs: 300 (enough to converge without risking divergence)
|
| 102 |
+
|
| 103 |
+
If training stops with a divergence warning, reduce epochs or batch size.
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
## Realistic Performance Expectations
|
| 108 |
+
|
| 109 |
+
| Task | Realistic Performance | Notes |
|
| 110 |
+
|------|-----------------------|-------|
|
| 111 |
+
| Anomaly detection | > 90% accuracy | ✓ Achievable, used in production |
|
| 112 |
+
| Online learning | < 100 steps to converge | ✓ Fast adaptation |
|
| 113 |
+
| Cascades (end-to-end) | Stable training, gradient flow | ✓ Works well |
|
| 114 |
+
| Sin regression (MAE) | ≈ 0.4–0.5 | ✗ Not the right tool — use PyTorch |
|
| 115 |
+
| Image reconstruction | Depends on model size | ✓ Fine-tuning with ComfyUI nodes |
|
| 116 |
+
| General regression | Baseline only | ✗ Not optimized |
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
## When to Use STANNO (Decision Tree)
|
| 121 |
+
|
| 122 |
+
**Do you want to:**
|
| 123 |
+
- Detect anomalies in a stream? → Use STANNO + filter ✓
|
| 124 |
+
- Learn from one sample at a time? → Use ContinualSTANNO ✓
|
| 125 |
+
- Train an encoder-decoder pipeline? → Use CascadeSTANNO ✓
|
| 126 |
+
- Fit sin(x) accurately? → Use PyTorch ✗
|
| 127 |
+
- Fine-tune a large pretrained model? → Use PyTorch ✗
|
| 128 |
+
- Generate images from scratch? → Use Stable Diffusion directly ✗
|
| 129 |
+
- Compose STANNO with image generation? → Use ComfyUI nodes ✓
|
| 130 |
+
|
| 131 |
+
---
|
| 132 |
+
|
| 133 |
+
## FAQ
|
| 134 |
+
|
| 135 |
+
**Q: Why doesn't STANNO fit sin(x) well?**
|
| 136 |
+
|
| 137 |
+
A: It's not designed for regression. The fixed 4-module trainer works great for anomaly detection and online learning, but arbitrary function fitting needs backpropagation or evolution. Use PyTorch for that.
|
| 138 |
+
|
| 139 |
+
**Q: Will longer training improve accuracy?**
|
| 140 |
+
|
| 141 |
+
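A: Usually not. Training has built-in early stopping (the `patience` parameter), so it already stops once the loss stops improving. Raising the epoch limit beyond that point does not add accuracy; it mainly increases the risk of overfitting and of divergence (see "Training Divergence" above).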
|
| 142 |
+
|
| 143 |
+
**Q: Which trainer should I use: Fixed, LocalRule, or Evolutionary?**
|
| 144 |
+
|
| 145 |
+
A: Start with **Fixed** — it's stable and interpretable. **LocalRule** learns per-synapse update rules, which can be powerful but also unstable. **Evolutionary** uses evolution strategies (ES); it is slower and more experimental. Try each on your problem and compare.
|
| 146 |
+
|
| 147 |
+
**Q: Is STANNO production-ready?**
|
| 148 |
+
|
| 149 |
+
A: For anomaly detection and online learning: **yes**. For regression or general purpose training: **no**. For ComfyUI image workflows: **yes, use the nodes**.
|
| 150 |
+
|
| 151 |
+
---
|
| 152 |
+
|
| 153 |
+
## Bottom Line
|
| 154 |
+
|
| 155 |
+
STANNO is specialized for anomaly detection, online learning, cascading, and ComfyUI workflows. It's not a general-purpose neural network and not a replacement for PyTorch or TensorFlow. Use it where the strengths match your problem.
|
examples/anomaly_filter.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"stanno": {
|
| 3 |
+
"layers": [128, 64, 64, 128],
|
| 4 |
+
"trainer_type": "fixed",
|
| 5 |
+
"learning_rate": 0.001,
|
| 6 |
+
"feedback_projection": "linear"
|
| 7 |
+
},
|
| 8 |
+
"data": {
|
| 9 |
+
"path": "examples/normal_embeddings.npy",
|
| 10 |
+
"format": "numpy",
|
| 11 |
+
"split_ratio": 0.8,
|
| 12 |
+
"normalize": true
|
| 13 |
+
},
|
| 14 |
+
"fit": {
|
| 15 |
+
"epochs": 200,
|
| 16 |
+
"batch_size": 32,
|
| 17 |
+
"log_every": 20,
|
| 18 |
+
"patience": 15
|
| 19 |
+
},
|
| 20 |
+
"save_path": "examples/anomaly_filter.stanno.pkl",
|
| 21 |
+
"filter": {
|
| 22 |
+
"anomaly_threshold": 0.65,
|
| 23 |
+
"block_above_threshold": true,
|
| 24 |
+
"metadata_field": "stanno_filter"
|
| 25 |
+
},
|
| 26 |
+
"llm": {
|
| 27 |
+
"base_url": "http://localhost:11434",
|
| 28 |
+
"model": "llama3.2:3b",
|
| 29 |
+
"temperature": 0.7,
|
| 30 |
+
"max_tokens": 512,
|
| 31 |
+
"timeout_seconds": 60
|
| 32 |
+
}
|
| 33 |
+
}
|
examples/cascade_autoencoder.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_comment": "CascadeSTANNO autoencoder: encoder [16→8→4] + decoder [4→8→16]",
|
| 3 |
+
"stages": [
|
| 4 |
+
{
|
| 5 |
+
"layers": [16, 8, 4],
|
| 6 |
+
"trainer_type": "fixed",
|
| 7 |
+
"learning_rate": 0.05
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"layers": [4, 8, 16],
|
| 11 |
+
"trainer_type": "fixed",
|
| 12 |
+
"learning_rate": 0.05
|
| 13 |
+
}
|
| 14 |
+
],
|
| 15 |
+
"frozen": [false, false],
|
| 16 |
+
"mode": "endtoend",
|
| 17 |
+
"data": {
|
| 18 |
+
"format": "builtin:sin",
|
| 19 |
+
"n_samples": 256
|
| 20 |
+
},
|
| 21 |
+
"fit": {
|
| 22 |
+
"epochs": 300,
|
| 23 |
+
"batch_size": 32,
|
| 24 |
+
"patience": 30,
|
| 25 |
+
"log_every": 50
|
| 26 |
+
},
|
| 27 |
+
"save_path": "examples/cascade_autoencoder.cascade.pkl"
|
| 28 |
+
}
|
examples/sin_regression.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_comment": "Sin regression example. STANNO approximates sin(2πx) on x∈[0,1]. FixedTrainerNet is equivalent to vanilla SGD — lr=0.05 converges to MAE~0.15. For MAE<0.01, use Adam (not available in FixedTrainer) or LocalRuleTrainerNet after meta-training.",
|
| 3 |
+
"stanno": {
|
| 4 |
+
"layers": [1, 32, 1],
|
| 5 |
+
"trainer_type": "fixed",
|
| 6 |
+
"learning_rate": 0.05,
|
| 7 |
+
"feedback_projection": "repeat"
|
| 8 |
+
},
|
| 9 |
+
"data": {
|
| 10 |
+
"format": "builtin:sin",
|
| 11 |
+
"n_samples": 512,
|
| 12 |
+
"split_ratio": 0.8,
|
| 13 |
+
"normalize": false
|
| 14 |
+
},
|
| 15 |
+
"fit": {
|
| 16 |
+
"epochs": 300,
|
| 17 |
+
"batch_size": 64,
|
| 18 |
+
"log_every": 30,
|
| 19 |
+
"patience": 20
|
| 20 |
+
},
|
| 21 |
+
"save_path": "examples/sin_regression.stanno.pkl",
|
| 22 |
+
"dream": [
|
| 23 |
+
{
|
| 24 |
+
"num_steps": 32,
|
| 25 |
+
"noise_sigma": 0.0,
|
| 26 |
+
"blind_inputs": false
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"num_steps": 32,
|
| 30 |
+
"noise_sigma": 0.1,
|
| 31 |
+
"blind_inputs": false
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"num_steps": 32,
|
| 35 |
+
"noise_sigma": 0.3,
|
| 36 |
+
"blind_inputs": true
|
| 37 |
+
}
|
| 38 |
+
]
|
| 39 |
+
}
|
examples/sin_regression.stanno.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ad6551ae1d978018f1ad04b08130f42ae39fe159b76a52295fe1b2a2b032c68
|
| 3 |
+
size 2466
|
pyproject.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "stanno"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Self-Training Artificial Neural Network Object — biologically-inspired direct-weight-modification neural architecture"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.9"
|
| 11 |
+
license = {text = "MIT"}
|
| 12 |
+
keywords = ["neural-network", "stanno", "thaler", "meta-learning"]
|
| 13 |
+
dependencies = [
|
| 14 |
+
"numpy>=1.24",
|
| 15 |
+
]
|
| 16 |
+
|
| 17 |
+
[project.optional-dependencies]
|
| 18 |
+
data = ["pandas>=2.0"]
|
| 19 |
+
llm = ["httpx>=0.27"]
|
| 20 |
+
torch = ["torch>=2.0"]
|
| 21 |
+
all = ["stanno[data,llm,torch]"]
|
| 22 |
+
|
| 23 |
+
[project.scripts]
|
| 24 |
+
stanno = "stanno.cli:main"
|
| 25 |
+
|
| 26 |
+
[tool.setuptools.packages.find]
|
| 27 |
+
where = ["."]
|
| 28 |
+
include = ["stanno*"]
|
| 29 |
+
|
| 30 |
+
[tool.setuptools.package-data]
|
| 31 |
+
stanno = ["py.typed"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core (always required)
|
| 2 |
+
numpy>=1.24
|
| 3 |
+
|
| 4 |
+
# Data loading (optional but recommended)
|
| 5 |
+
pandas>=2.0
|
| 6 |
+
|
| 7 |
+
# LLM integration (optional)
|
| 8 |
+
httpx>=0.27
|
| 9 |
+
|
| 10 |
+
# PyTorch (optional — enables TorchBackend and Phase 2b meta-training)
|
| 11 |
+
# torch>=2.0
|
scripts/generate_clip_embeddings.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate CLIP image embeddings for a folder of reference images.
|
| 3 |
+
|
| 4 |
+
These embeddings are then used to train a STANNO as a style autoencoder,
|
| 5 |
+
which can be loaded into the ComfyUI STANNODreamCond or STANNODynamicLoRA
|
| 6 |
+
nodes for conditioning/weight-patch injection.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python scripts/generate_clip_embeddings.py \
|
| 10 |
+
--dir my_style_images/ \
|
| 11 |
+
--out style_embeddings.npy \
|
| 12 |
+
[--model ViT-L-14] [--pretrained openai]
|
| 13 |
+
|
| 14 |
+
Requirements:
|
| 15 |
+
pip install open-clip-torch Pillow
|
| 16 |
+
|
| 17 |
+
Outputs:
|
| 18 |
+
A .npy file of shape (N, 768) — one 768-dim CLIP embedding per image.
|
| 19 |
+
Compatible with SD 1.5 CLIP-L text encoder embedding space.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
import argparse
|
| 24 |
+
import sys
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
|
| 27 |
+
import numpy as np
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the CLIP embedding generator.

    Returns:
        Namespace with the attributes ``dir``, ``out``, ``model``,
        ``pretrained``, ``batch`` and ``device``.
    """
    parser = argparse.ArgumentParser(
        description="Generate CLIP embeddings for a folder of images"
    )
    # Declared as a table so the registration order (and therefore the
    # ``--help`` layout) is visible at a glance.
    option_table = (
        ("--dir", {"required": True, "help": "Folder of input images (png, jpg, webp)"}),
        ("--out", {"required": True, "help": "Output .npy path"}),
        ("--model", {"default": "ViT-L-14", "help": "OpenCLIP model name"}),
        ("--pretrained", {"default": "openai", "help": "OpenCLIP pretrained weights"}),
        ("--batch", {"type": int, "default": 16, "help": "Batch size for encoding"}),
        ("--device", {"default": "cuda", "help": "Device: cuda or cpu"}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def main() -> None:
    """Encode every image in ``--dir`` with OpenCLIP and save the features.

    The images (png/jpg/jpeg/webp, matched case-sensitively, non-recursive)
    are sorted by path, encoded in batches of ``--batch`` with
    ``model.encode_image`` and written as one float32 ``.npy`` matrix with
    one row per image.

    Exits with status 1 when a required package is missing or when the
    folder contains no matching image.
    """
    args = parse_args()

    # Heavy optional dependencies are imported only after argument parsing,
    # so ``--help`` and argument errors do not require them.
    try:
        import torch
        import open_clip
        from PIL import Image
    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("Install with: pip install open-clip-torch Pillow")
        sys.exit(1)

    image_paths = sorted(
        p for ext in ("*.png", "*.jpg", "*.jpeg", "*.webp")
        for p in Path(args.dir).glob(ext)
    )
    if not image_paths:
        print(f"No images found in {args.dir}")
        sys.exit(1)

    print(f"Found {len(image_paths)} images in {args.dir}")

    # The CLI default is "cuda"; fall back to the CPU instead of crashing
    # on machines without a usable CUDA runtime.
    device = args.device
    if device.startswith("cuda") and not torch.cuda.is_available():
        print(f"Device '{device}' requested but CUDA is not available; falling back to cpu")
        device = "cpu"

    model, _, preprocess = open_clip.create_model_and_transforms(
        args.model, pretrained=args.pretrained
    )
    model.eval().to(device)

    def load_image(path: Path):
        """Open one image, convert it to RGB and apply the CLIP preprocessing."""
        # The context manager closes the underlying file handle right away
        # instead of leaving it to the garbage collector.
        with Image.open(str(path)) as img:
            return preprocess(img.convert("RGB"))

    all_embeddings: list[np.ndarray] = []

    for i in range(0, len(image_paths), args.batch):
        batch_paths = image_paths[i : i + args.batch]
        imgs = torch.stack([load_image(p) for p in batch_paths]).to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            feats = model.encode_image(imgs)

        all_embeddings.append(feats.cpu().numpy())
        print(f"  Encoded {min(i + args.batch, len(image_paths))}/{len(image_paths)}")

    embeddings = np.concatenate(all_embeddings, axis=0).astype(np.float32)
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    np.save(str(out_path), embeddings)

    print(f"\nSaved {embeddings.shape} embeddings → {out_path}")
    print("Use this file with train_stanno_on_embeddings.py or STANNOTrainImages (ComfyUI).")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
if __name__ == "__main__":
|
| 92 |
+
main()
|
scripts/train_stanno_on_embeddings.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Train a STANNO autoencoder on pre-computed CLIP embeddings.
|
| 3 |
+
|
| 4 |
+
Run generate_clip_embeddings.py first to produce the .npy file, then run
|
| 5 |
+
this script to train and save the STANNO. The resulting .pkl file can be
|
| 6 |
+
loaded into ComfyUI via the STANNOLoad node.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python scripts/train_stanno_on_embeddings.py \
|
| 10 |
+
--embeddings style_embeddings.npy \
|
| 11 |
+
--out stanno_clip_style.pkl \
|
| 12 |
+
[--hidden 256] \
|
| 13 |
+
[--epochs 300] \
|
| 14 |
+
[--lr 0.005] \
|
| 15 |
+
[--trainer fixed]
|
| 16 |
+
|
| 17 |
+
The input/output dimension is inferred automatically from the embedding file
|
| 18 |
+
(typically 768 for SD 1.5 / ViT-L-14 CLIP).
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
import argparse
|
| 23 |
+
import pickle
|
| 24 |
+
import sys
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
|
| 27 |
+
import numpy as np
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the embedding-autoencoder trainer.

    Returns:
        Namespace with the attributes ``embeddings``, ``out``, ``hidden``,
        ``extra_hidden``, ``epochs``, ``batch_size``, ``lr`` and ``trainer``.
    """
    parser = argparse.ArgumentParser(
        description="Train a STANNO autoencoder on CLIP embeddings"
    )
    # One row per option, in the order they appear in ``--help``.
    option_table = (
        ("--embeddings", {"required": True,
                          "help": "Path to .npy file of shape (N, dim)"}),
        ("--out", {"required": True,
                   "help": "Output .pkl path for the trained STANNO"}),
        ("--hidden", {"type": int, "default": 256,
                      "help": "Hidden layer width (default 256 → [dim, 256, dim])"}),
        ("--extra-hidden", {"type": int, "default": 0,
                            "help": "Add a second hidden layer of this width (0 = disabled)"}),
        ("--epochs", {"type": int, "default": 300,
                      "help": "Training epochs (default 300)"}),
        ("--batch-size", {"type": int, "default": 32}),
        ("--lr", {"type": float, "default": 0.005, "help": "Learning rate"}),
        ("--trainer", {"default": "fixed",
                       "choices": ["fixed", "local_rule", "evolutionary"]}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def main() -> None:
    """Train a STANNO autoencoder on a ``.npy`` embedding matrix and pickle it.

    Loads ``--embeddings``, builds the layer list
    ``[dim, hidden, (extra_hidden,) dim]``, fits the network to reconstruct
    its own input, pickles the trained object to ``--out`` and prints the
    reconstruction MSE on the first 8 rows as a sanity check.

    Exits with status 1 when the embeddings file does not exist or does not
    hold a non-empty 2-D array.
    """
    args = parse_args()

    embeddings_path = Path(args.embeddings)
    if not embeddings_path.is_file():
        print(f"File not found: {embeddings_path}")
        sys.exit(1)

    embeddings = np.load(str(embeddings_path)).astype(np.float32)
    # The layer sizes are derived from the second axis, so anything other
    # than a non-empty (N, dim) matrix cannot be trained on. Report it
    # clearly instead of failing later with an unpacking error.
    if embeddings.ndim != 2 or embeddings.size == 0:
        print(f"Expected a non-empty 2-D array of shape (N, dim), got shape {embeddings.shape}")
        sys.exit(1)
    n, dim = embeddings.shape
    print(f"Loaded {n} embeddings of dim={dim} from {embeddings_path}")

    # Build layers list
    layers = [dim, args.hidden]
    if args.extra_hidden > 0:
        layers.append(args.extra_hidden)
    layers.append(dim)
    print(f"Architecture: {layers}")

    # Import STANNO (add repo root to path if needed) so the script also
    # works when the package is not installed.
    repo_root = str(Path(__file__).parent.parent)
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)

    from stanno.config.schema import STANNOConfig
    from stanno.core.stanno import STANNO

    config = STANNOConfig(
        layers=layers,
        trainer_type=args.trainer,
        learning_rate=args.lr,
    )
    stanno = STANNO(config)

    # Print roughly ten progress lines regardless of the epoch count.
    report_every = max(1, args.epochs // 10)

    def log_cb(epoch: int, loss: float) -> None:
        """Progress callback handed to ``fit``; prints every ``report_every`` epochs."""
        if (epoch + 1) % report_every == 0:
            print(f"  epoch {epoch + 1:5d} / {args.epochs}  loss={loss:.5f}")

    print(f"\nTraining STANNO ({args.trainer}) for {args.epochs} epochs …")
    # Autoencoder objective: the target is the input itself.
    stanno.fit(
        embeddings,
        embeddings,
        epochs=args.epochs,
        batch_size=args.batch_size,
        callback=log_cb,
    )

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(str(out_path), "wb") as f:
        pickle.dump(stanno, f)

    # Quick sanity check
    preds = stanno.predict(embeddings[:8])
    mse = float(np.mean((preds - embeddings[:8]) ** 2))
    print(f"\nFinal MSE on first 8 samples: {mse:.5f}")
    print(f"Saved trained STANNO → {out_path}")
    print("\nNext steps:")
    print("  1. Load in ComfyUI: STANNO Loader node → model_path =", out_path)
    print("  2. For Dream Conditioning: connect to 'STANNO Dream Conditioning' node")
    print("  3. For Dynamic LoRA: connect to 'STANNO Dynamic LoRA' node")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
if __name__ == "__main__":
|
| 112 |
+
main()
|
stanno.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""STANNO-style proof of concept
|
| 2 |
+
|
| 3 |
+
This module implements a very simple Self-Training Artificial Neural Network Object (STANNO)
|
| 4 |
+
loosely inspired by Thaler's description: two neural networks, one of which trains the other,
|
| 5 |
+
optionally folded into a single object.
|
| 6 |
+
|
| 7 |
+
Design choices:
|
| 8 |
+
- TraineeNet: a small multilayer perceptron (MLP) that learns a supervised mapping.
|
| 9 |
+
- Trainer: training logic embedded inside STANNO using standard gradient descent.
|
| 10 |
+
Conceptually this plays the role of the "trainer" network described in the literature,
|
| 11 |
+
but here it is implemented as explicit code for simplicity.
|
| 12 |
+
|
| 13 |
+
Features included for experimentation:
|
| 14 |
+
- Supervised training on a toy dataset (e.g., y = sin(x)).
|
| 15 |
+
- "Dreaming": run the trained net on a fixed or random latent input with inputs partially
|
| 16 |
+
or totally "blinded" (set to zero or constant) to observe internal dynamics.
|
| 17 |
+
- Noise injection: add Gaussian noise with adjustable standard deviation to all weights,
|
| 18 |
+
to explore how output complexity changes with noise level (from "stupidity" to chaos).
|
| 19 |
+
- Lesioning: randomly zero out a fraction of weights to mimic progressive "death" of
|
| 20 |
+
connections and observe degradation ("tunnel vision").
|
| 21 |
+
|
| 22 |
+
The goal is not to reproduce the original spreadsheet implementation, but to give a
|
| 23 |
+
simple, hackable playground in modern Python/NumPy that you can extend (including
|
| 24 |
+
replacing the hard-coded trainer by a learned meta-network if desired).
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
import numpy as np
|
| 29 |
+
from dataclasses import dataclass
|
| 30 |
+
from typing import Tuple, Callable
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
|
| 34 |
+
class TraineeNet:
|
| 35 |
+
"""Simple 2-layer MLP (input -> hidden -> output).
|
| 36 |
+
|
| 37 |
+
This is the network that will be trained by the STANNO object.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
input_dim: int
|
| 41 |
+
hidden_dim: int
|
| 42 |
+
output_dim: int
|
| 43 |
+
|
| 44 |
+
def __post_init__(self) -> None:
|
| 45 |
+
rng = np.random.default_rng()
|
| 46 |
+
# Xavier-like initialization
|
| 47 |
+
self.W1 = rng.normal(0.0, 1.0 / np.sqrt(self.input_dim), (self.input_dim, self.hidden_dim))
|
| 48 |
+
self.b1 = np.zeros((1, self.hidden_dim))
|
| 49 |
+
self.W2 = rng.normal(0.0, 1.0 / np.sqrt(self.hidden_dim), (self.hidden_dim, self.output_dim))
|
| 50 |
+
self.b2 = np.zeros((1, self.output_dim))
|
| 51 |
+
|
| 52 |
+
def parameters(self):
|
| 53 |
+
return [self.W1, self.b1, self.W2, self.b2]
|
| 54 |
+
|
| 55 |
+
def forward(self, x: np.ndarray) -> Tuple[np.ndarray, dict]:
|
| 56 |
+
"""Forward pass returning output and cache for backprop."""
|
| 57 |
+
z1 = x @ self.W1 + self.b1
|
| 58 |
+
a1 = np.tanh(z1)
|
| 59 |
+
z2 = a1 @ self.W2 + self.b2
|
| 60 |
+
y = z2 # regression; for classification you could add softmax
|
| 61 |
+
cache = {"x": x, "z1": z1, "a1": a1, "z2": z2}
|
| 62 |
+
return y, cache
|
| 63 |
+
|
| 64 |
+
def apply_parameter_noise(self, sigma: float, rng: np.random.Generator | None = None) -> None:
|
| 65 |
+
"""Add Gaussian noise with std sigma to all parameters in-place."""
|
| 66 |
+
if sigma <= 0:
|
| 67 |
+
return
|
| 68 |
+
if rng is None:
|
| 69 |
+
rng = np.random.default_rng()
|
| 70 |
+
for p in self.parameters():
|
| 71 |
+
p += rng.normal(0.0, sigma, p.shape)
|
| 72 |
+
|
| 73 |
+
def lesion(self, fraction: float, rng: np.random.Generator | None = None) -> None:
|
| 74 |
+
"""Randomly zero out a fraction of weights (simulated neuron/connection death).
|
| 75 |
+
|
| 76 |
+
fraction in [0, 1]. Only affects W1 and W2; biases remain.
|
| 77 |
+
"""
|
| 78 |
+
fraction = float(np.clip(fraction, 0.0, 1.0))
|
| 79 |
+
if fraction <= 0:
|
| 80 |
+
return
|
| 81 |
+
if rng is None:
|
| 82 |
+
rng = np.random.default_rng()
|
| 83 |
+
for W in (self.W1, self.W2):
|
| 84 |
+
mask = rng.random(W.shape) < fraction
|
| 85 |
+
W[mask] = 0.0
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class STANNO:
|
| 89 |
+
"""Self-Training Neural Network Object (STANNO-style).
|
| 90 |
+
|
| 91 |
+
Encapsula:
|
| 92 |
+
- Una red entrenable (TraineeNet).
|
| 93 |
+
- Un algoritmo de entrenamiento interno (gradient descent) que actúa como
|
| 94 |
+
"trainer" y actualiza los pesos a partir de ejemplos.
|
| 95 |
+
|
| 96 |
+
Esto sigue el espíritu de los STANNO descritos por Thaler: un objeto que
|
| 97 |
+
contiene la red y su mecanismo de entrenamiento, con capacidad de seguir
|
| 98 |
+
aprendiendo en línea.[cite:1][cite:3]
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
+
def __init__(
|
| 102 |
+
self,
|
| 103 |
+
input_dim: int,
|
| 104 |
+
hidden_dim: int,
|
| 105 |
+
output_dim: int,
|
| 106 |
+
learning_rate: float = 1e-2,
|
| 107 |
+
) -> None:
|
| 108 |
+
self.net = TraineeNet(input_dim, hidden_dim, output_dim)
|
| 109 |
+
self.learning_rate = learning_rate
|
| 110 |
+
|
| 111 |
+
# ---------------------- Core training logic ----------------------
|
| 112 |
+
|
| 113 |
+
def _loss_and_grads(self, x: np.ndarray, y_true: np.ndarray) -> Tuple[float, list]:
|
| 114 |
+
"""Compute MSE loss and gradients via backprop for one batch."""
|
| 115 |
+
y_pred, cache = self.net.forward(x)
|
| 116 |
+
# Mean squared error
|
| 117 |
+
diff = y_pred - y_true
|
| 118 |
+
loss = float(np.mean(diff ** 2))
|
| 119 |
+
|
| 120 |
+
# Backprop
|
| 121 |
+
batch_size = x.shape[0]
|
| 122 |
+
dL_dy = (2.0 / batch_size) * diff # dL/dy
|
| 123 |
+
|
| 124 |
+
# Layer 2
|
| 125 |
+
a1 = cache["a1"]
|
| 126 |
+
dL_dW2 = a1.T @ dL_dy
|
| 127 |
+
dL_db2 = np.sum(dL_dy, axis=0, keepdims=True)
|
| 128 |
+
|
| 129 |
+
# Through tanh
|
| 130 |
+
dz2 = dL_dy @ self.net.W2.T
|
| 131 |
+
da1 = dz2
|
| 132 |
+
dz1 = da1 * (1.0 - np.tanh(cache["z1"]) ** 2)
|
| 133 |
+
|
| 134 |
+
# Layer 1
|
| 135 |
+
x_batch = cache["x"]
|
| 136 |
+
dL_dW1 = x_batch.T @ dz1
|
| 137 |
+
dL_db1 = np.sum(dz1, axis=0, keepdims=True)
|
| 138 |
+
|
| 139 |
+
grads = [dL_dW1, dL_db1, dL_dW2, dL_db2]
|
| 140 |
+
return loss, grads
|
| 141 |
+
|
| 142 |
+
def trainer_step(self, x: np.ndarray, y_true: np.ndarray) -> float:
|
| 143 |
+
"""One training step of the internal trainer over a mini-batch.
|
| 144 |
+
|
| 145 |
+
Conceptualmente, esto es el "trainer network" que ajusta pesos del
|
| 146 |
+
TraineeNet. Aquí se implementa como gradiente descendente directo.
|
| 147 |
+
"""
|
| 148 |
+
loss, grads = self._loss_and_grads(x, y_true)
|
| 149 |
+
for param, grad in zip(self.net.parameters(), grads):
|
| 150 |
+
param -= self.learning_rate * grad
|
| 151 |
+
return loss
|
| 152 |
+
|
| 153 |
+
def fit(
|
| 154 |
+
self,
|
| 155 |
+
x: np.ndarray,
|
| 156 |
+
y: np.ndarray,
|
| 157 |
+
epochs: int = 1000,
|
| 158 |
+
batch_size: int = 32,
|
| 159 |
+
shuffle: bool = True,
|
| 160 |
+
callback: Callable[[int, float], None] | None = None,
|
| 161 |
+
) -> None:
|
| 162 |
+
"""Train on a dataset using internal trainer.
|
| 163 |
+
|
| 164 |
+
Args:
|
| 165 |
+
x: shape (N, input_dim)
|
| 166 |
+
y: shape (N, output_dim)
|
| 167 |
+
epochs: number of passes over the dataset
|
| 168 |
+
batch_size: mini-batch size
|
| 169 |
+
shuffle: whether to shuffle each epoch
|
| 170 |
+
callback: optional function(epoch, loss) for logging
|
| 171 |
+
"""
|
| 172 |
+
N = x.shape[0]
|
| 173 |
+
rng = np.random.default_rng()
|
| 174 |
+
|
| 175 |
+
for epoch in range(epochs):
|
| 176 |
+
idx = np.arange(N)
|
| 177 |
+
if shuffle:
|
| 178 |
+
rng.shuffle(idx)
|
| 179 |
+
x_shuf = x[idx]
|
| 180 |
+
y_shuf = y[idx]
|
| 181 |
+
|
| 182 |
+
losses = []
|
| 183 |
+
for start in range(0, N, batch_size):
|
| 184 |
+
end = start + batch_size
|
| 185 |
+
xb = x_shuf[start:end]
|
| 186 |
+
yb = y_shuf[start:end]
|
| 187 |
+
loss = self.trainer_step(xb, yb)
|
| 188 |
+
losses.append(loss)
|
| 189 |
+
|
| 190 |
+
mean_loss = float(np.mean(losses))
|
| 191 |
+
if callback is not None:
|
| 192 |
+
callback(epoch, mean_loss)
|
| 193 |
+
|
| 194 |
+
# ---------------------- Inference & "dreaming" ----------------------
|
| 195 |
+
|
| 196 |
+
def predict(self, x: np.ndarray) -> np.ndarray:
|
| 197 |
+
y, _ = self.net.forward(x)
|
| 198 |
+
return y
|
| 199 |
+
|
| 200 |
+
def dream(
|
| 201 |
+
self,
|
| 202 |
+
num_steps: int = 128,
|
| 203 |
+
input_seed: np.ndarray | None = None,
|
| 204 |
+
noise_sigma: float = 0.0,
|
| 205 |
+
blind_inputs: bool = False,
|
| 206 |
+
rng: np.random.Generator | None = None,
|
| 207 |
+
) -> np.ndarray:
|
| 208 |
+
"""Generate a sequence of outputs by driving the net with a simple or blind input.
|
| 209 |
+
|
| 210 |
+
Args:
|
| 211 |
+
num_steps: length of the sequence to generate.
|
| 212 |
+
input_seed: initial input vector; if None, uses zeros.
|
| 213 |
+
noise_sigma: amount of noise to add to weights *once* before dreaming.
|
| 214 |
+
blind_inputs: if True, inputs are forced to zero every step.
|
| 215 |
+
rng: optional RNG.
|
| 216 |
+
|
| 217 |
+
Returns:
|
| 218 |
+
Array of generated outputs of shape (num_steps, output_dim).
|
| 219 |
+
"""
|
| 220 |
+
if rng is None:
|
| 221 |
+
rng = np.random.default_rng()
|
| 222 |
+
|
| 223 |
+
# Work on a copy so as not to permanently corrupt the trained net
|
| 224 |
+
shadow = TraineeNet(self.net.input_dim, self.net.hidden_dim, self.net.output_dim)
|
| 225 |
+
shadow.W1 = self.net.W1.copy()
|
| 226 |
+
shadow.b1 = self.net.b1.copy()
|
| 227 |
+
shadow.W2 = self.net.W2.copy()
|
| 228 |
+
shadow.b2 = self.net.b2.copy()
|
| 229 |
+
shadow.apply_parameter_noise(noise_sigma, rng=rng)
|
| 230 |
+
|
| 231 |
+
if input_seed is None:
|
| 232 |
+
x = np.zeros((1, self.net.input_dim))
|
| 233 |
+
else:
|
| 234 |
+
x = input_seed.reshape(1, -1)
|
| 235 |
+
|
| 236 |
+
outputs = []
|
| 237 |
+
for _ in range(num_steps):
|
| 238 |
+
if blind_inputs:
|
| 239 |
+
x_step = np.zeros_like(x)
|
| 240 |
+
else:
|
| 241 |
+
x_step = x
|
| 242 |
+
y, _ = shadow.forward(x_step)
|
| 243 |
+
outputs.append(y.copy())
|
| 244 |
+
# Simple feedback: feed output (or part of él) as next input
|
| 245 |
+
# This makes the sequence sensitive to internal weights.
|
| 246 |
+
if self.net.output_dim == self.net.input_dim:
|
| 247 |
+
x = y
|
| 248 |
+
else:
|
| 249 |
+
# Project or tile to match input dim
|
| 250 |
+
x = np.repeat(y, self.net.input_dim // self.net.output_dim + 1, axis=1)[
|
| 251 |
+
:, : self.net.input_dim
|
| 252 |
+
]
|
| 253 |
+
|
| 254 |
+
return np.concatenate(outputs, axis=0)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ---------------------- Demo utilities ----------------------
|
| 258 |
+
|
| 259 |
+
def make_sin_dataset(n_samples: int = 256) -> Tuple[np.ndarray, np.ndarray]:
|
| 260 |
+
"""Simple 1D regression dataset: y = sin(x) on [0, 2π]."""
|
| 261 |
+
rng = np.random.default_rng()
|
| 262 |
+
x = rng.uniform(0.0, 2.0 * np.pi, size=(n_samples, 1))
|
| 263 |
+
y = np.sin(x)
|
| 264 |
+
return x, y
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def demo_train_and_dream() -> None:
    """Train a STANNO on sin(x) and then explore noise/lesion effects.

    Run this function directly ("python stanno_poc.py") to see numeric output.
    """
    x, y = make_sin_dataset(512)
    stanno = STANNO(input_dim=1, hidden_dim=32, output_dim=1, learning_rate=5e-3)

    def log_progress(epoch: int, loss: float) -> None:
        # Report every 100th epoch only, to keep the output short.
        if (epoch + 1) % 100 == 0:
            print(f"Epoch {epoch:4d} loss={loss:.5f}")

    print("Training STANNO on y = sin(x)...")
    stanno.fit(
        x,
        y,
        epochs=500,
        batch_size=64,
        callback=log_progress,
    )

    # Evaluate basic fit
    xs = np.linspace(0, 2 * np.pi, 16).reshape(-1, 1)
    preds = stanno.predict(xs)
    print("\nSample predictions after training:")
    for xi, yi, yi_hat in zip(xs.flatten(), np.sin(xs).flatten(), preds.flatten()):
        print(f"x={xi:5.2f} sin(x)={yi: .3f} pred={yi_hat: .3f}")

    # Dreaming with different noise levels
    for sigma in [0.0, 0.05, 0.2, 0.5]:
        seq = stanno.dream(num_steps=32, noise_sigma=sigma, blind_inputs=True)
        print(f"\nDreaming with noise_sigma={sigma} (first 10 outputs):")
        print(np.round(seq[:10].flatten(), 3))

    # Lesion experiment
    print("\nLesioning 70% of weights and evaluating error on test points...")
    # Back up the parameters so the trained net can be restored afterwards.
    backup = [p.copy() for p in stanno.net.parameters()]
    try:
        stanno.net.lesion(fraction=0.7)
        preds_lesioned = stanno.predict(xs)
        mse_lesioned = float(np.mean((preds_lesioned - np.sin(xs)) ** 2))
        print(f"MSE after lesioning 70% of weights: {mse_lesioned:.4f}")
    finally:
        # Restore in place even if evaluation fails.
        for param, saved in zip(stanno.net.parameters(), backup):
            param[...] = saved
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
if __name__ == "__main__":
|
| 314 |
+
demo_train_and_dream()
|
stanno/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
STANNO — Self-Training Artificial Neural Network Objects.
|
| 3 |
+
|
| 4 |
+
Inspired by Stephen Thaler's STANNO architecture (US patent 5852815A).
|
| 5 |
+
A neural object in which one network (TrainerNet) directly updates the
|
| 6 |
+
weights of another (TraineeNet), with support for dream mode, noise
|
| 7 |
+
injection, and progressive lesioning.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from stanno.core.stanno import STANNO
|
| 11 |
+
from stanno.config.schema import STANNOConfig, DataConfig, LLMConfig, FilterConfig
|
| 12 |
+
from stanno.integration.cascade import CascadeSTANNO
|
| 13 |
+
from stanno.integration.dsanno import DSANNO, ScanResult
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"STANNO",
|
| 17 |
+
"STANNOConfig",
|
| 18 |
+
"DataConfig",
|
| 19 |
+
"LLMConfig",
|
| 20 |
+
"FilterConfig",
|
| 21 |
+
"CascadeSTANNO",
|
| 22 |
+
"DSANNO",
|
| 23 |
+
"ScanResult",
|
| 24 |
+
]
|
stanno/__main__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from stanno.cli import main
|
| 2 |
+
main()
|
stanno/cli.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
STANNO command-line interface.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
stanno train --config examples/sin_regression.json
|
| 6 |
+
stanno predict --config examples/sin_regression.json --input 0.25
|
| 7 |
+
stanno dream --config examples/sin_regression.json
|
| 8 |
+
stanno evaluate --config examples/sin_regression.json
|
| 9 |
+
stanno filter --config examples/anomaly_filter.json [--file prompts.txt]
|
| 10 |
+
stanno cascade --config examples/cascade_autoencoder.json
|
| 11 |
+
stanno scan --config examples/scan_demo.json --data data.npy [--threshold 0.05] [--top-k 10]
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
import argparse
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Any, Dict, List, Optional
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ─── helpers ─────────────────────────────────────────────────────────────────
|
| 26 |
+
|
| 27 |
+
def _load_config(path: str) -> Dict[str, Any]:
|
| 28 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 29 |
+
return json.load(f)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _build_stanno(cfg_dict: Dict[str, Any]):
    """Create a STANNO from the ``"stanno"`` section of a config dict.

    Keys missing from that section fall back to the CLI defaults listed
    below; a missing section is treated as empty.
    """
    from stanno.config.schema import STANNOConfig
    from stanno.core.stanno import STANNO

    fallbacks = {
        "layers": [1, 32, 1],
        "trainer_type": "fixed",
        "backend": "numpy",
        "learning_rate": 0.01,
        "feedback_projection": "repeat",
        "trainer_kwargs": {},
    }
    section = cfg_dict.get("stanno", {})
    settings = {key: section.get(key, default) for key, default in fallbacks.items()}
    return STANNO(STANNOConfig(**settings))
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _load_data(cfg_dict: Dict[str, Any]):
    """Load data as described by the ``"data"`` section of a config dict.

    Builds a ``DataConfig`` from that section (a missing section is treated
    as empty), hands it to ``make_loader`` and returns whatever the loader's
    ``load()`` returns.
    """
    from stanno.config.schema import DataConfig
    from stanno.data.base import make_loader

    d = cfg_dict.get("data", {})
    kwargs: Dict[str, Any] = {
        "path": d.get("path", ""),
        # The CLI defaults to the built-in generator rather than DataConfig's "csv".
        "format": d.get("format", "builtin:sin"),
        "input_cols": d.get("input_cols"),
        "output_cols": d.get("output_cols"),
        "split_ratio": d.get("split_ratio", 0.8),
        "normalize": d.get("normalize", False),
    }
    # Forward n_samples only when it is actually set: passing None would
    # override DataConfig's integer default with a value of the wrong type.
    # NOTE(review): assumes loaders do not treat None specially — confirm.
    n_samples = d.get("n_samples")
    if n_samples is not None:
        kwargs["n_samples"] = n_samples
    return make_loader(DataConfig(**kwargs)).load()
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ─── sub-commands ─────────────────────────────────────────────────────────────
|
| 68 |
+
|
| 69 |
+
def cmd_train(args: argparse.Namespace) -> None:
    """Train a STANNO as described by ``args.config`` and save it.

    Reads the optional ``"fit"`` section (``epochs``, ``batch_size``,
    ``log_every``, ``patience``) and writes the model to the config's
    ``save_path``, or next to the config file with a ``.stanno.pkl``
    extension when no ``save_path`` is given.
    """
    cfg = _load_config(args.config)
    stanno = _build_stanno(cfg)
    x, y = _load_data(cfg)

    fit_cfg = cfg.get("fit", {})
    epochs = fit_cfg.get("epochs", 1000)
    batch_size = fit_cfg.get("batch_size", 32)
    # Clamp to 1 so a configured log_every of 0 cannot divide by zero below.
    log_every = max(1, fit_cfg.get("log_every", max(1, epochs // 10)))
    # Early stopping: stop if there is no improvement for this many epochs.
    patience = fit_cfg.get("patience", 20)

    def callback(epoch: int, loss: float) -> None:
        if (epoch + 1) % log_every == 0 or epoch == 0:
            print(f" epoch {epoch+1:5d}/{epochs} loss={loss:.6f}")

    print(f"Training {stanno} on {len(x)} samples …")
    stanno.fit(x, y, epochs=epochs, batch_size=batch_size, callback=callback, patience=patience)

    # Swap only the final extension. A plain str.replace would return the
    # config path unchanged when it has no ".json" (so the model would
    # overwrite the config) and would also rewrite ".json" in directory names.
    save_path = cfg.get("save_path") or (os.path.splitext(args.config)[0] + ".stanno.pkl")
    stanno.save(save_path)
    print(f"Saved → {save_path}")
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def cmd_predict(args: argparse.Namespace) -> None:
    """Load the trained model for ``args.config`` and print one prediction.

    ``args.input`` is a comma-separated list of numbers forming a single
    input row.
    """
    cfg = _load_config(args.config)
    # Swap only the final extension; str.replace would hand back the config
    # path itself when it does not contain ".json".
    load_path = cfg.get("save_path") or (os.path.splitext(args.config)[0] + ".stanno.pkl")
    from stanno.core.stanno import STANNO
    stanno = STANNO.load(load_path)
    x = np.array([[float(v) for v in args.input.split(",")]], dtype=np.float32)
    y = stanno.predict(x)
    print(f"input={x.ravel().tolist()} → output={y.ravel().tolist()}")
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def cmd_dream(args: argparse.Namespace) -> None:
    """Run one or more dream sequences with the trained model and print them.

    The config's optional ``"dream"`` entry may be a single dict or a list
    of dicts with ``num_steps``, ``noise_sigma`` and ``blind_inputs``; when
    absent, one run with the defaults below is performed.
    """
    cfg = _load_config(args.config)
    # Swap only the final extension; str.replace would hand back the config
    # path itself when it does not contain ".json".
    load_path = cfg.get("save_path") or (os.path.splitext(args.config)[0] + ".stanno.pkl")
    from stanno.core.stanno import STANNO
    stanno = STANNO.load(load_path)

    dream_cfgs = cfg.get("dream")
    if dream_cfgs is None:
        dream_cfgs = [{}]
    if isinstance(dream_cfgs, dict):
        dream_cfgs = [dream_cfgs]

    for i, dcfg in enumerate(dream_cfgs):
        # Resolve once so the printed header always matches the value used.
        noise_sigma = dcfg.get("noise_sigma", 0.1)
        seq = stanno.dream(
            num_steps=dcfg.get("num_steps", 64),
            noise_sigma=noise_sigma,
            blind_inputs=dcfg.get("blind_inputs", False),
        )
        print(f"\n── dream {i} (noise={noise_sigma}) ──")
        for step, row in enumerate(seq):
            print(f" step {step:3d}: {row.tolist()}")
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def cmd_evaluate(args: argparse.Namespace) -> None:
    """Print MSE and MAE of the trained model on the config's dataset."""
    cfg = _load_config(args.config)
    # Swap only the final extension; str.replace would hand back the config
    # path itself when it does not contain ".json".
    load_path = cfg.get("save_path") or (os.path.splitext(args.config)[0] + ".stanno.pkl")
    from stanno.core.stanno import STANNO
    stanno = STANNO.load(load_path)
    x, y = _load_data(cfg)
    residual = stanno.predict(x) - y
    mse = float(np.mean(residual ** 2))
    mae = float(np.mean(np.abs(residual)))
    print(f"MSE={mse:.6f} MAE={mae:.6f} N={len(x)}")
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def cmd_filter(args: argparse.Namespace) -> None:
    """Score prompts with the anomaly filter and print PASSED/BLOCKED per line.

    Prompts come from ``args.file`` (one per line) or, when no file is
    given, from an interactive prompt that scores each line as soon as it
    is entered and stops at end-of-input. Blank lines are skipped.
    """
    cfg = _load_config(args.config)
    # Swap only the final extension; str.replace would hand back the config
    # path itself when it does not contain ".json".
    load_path = cfg.get("save_path") or (os.path.splitext(args.config)[0] + ".stanno.pkl")
    from stanno.core.stanno import STANNO
    from stanno.config.schema import FilterConfig, LLMConfig
    from stanno.integration.filter import STANNOFilter
    from stanno.integration.llm_client import LLMClient

    stanno = STANNO.load(load_path)

    fcfg = cfg.get("filter", {})
    filter_config = FilterConfig(
        anomaly_threshold=fcfg.get("anomaly_threshold", 0.7),
        block_above_threshold=fcfg.get("block_above_threshold", True),
        metadata_field=fcfg.get("metadata_field", "stanno_filter"),
    )

    lcfg = cfg.get("llm", {})
    llm_config = LLMConfig(
        base_url=lcfg.get("base_url", "http://localhost:11434"),
        model=lcfg.get("model", "llama3.2:3b"),
        # LLMConfig declares api_key as a str defaulting to ""; never pass None.
        api_key=lcfg.get("api_key") or "",
        temperature=lcfg.get("temperature", 0.7),
        max_tokens=lcfg.get("max_tokens", 512),
        timeout_seconds=lcfg.get("timeout_seconds", 60),
    )
    llm = LLMClient(llm_config)
    filt = STANNOFilter(stanno, filter_config, llm)

    width = stanno.net.input_dim

    def report(line: str) -> None:
        """Encode one prompt, score it and print the verdict."""
        if not line.strip():
            return
        # Encode the prompt as a char-code vector (demo encoding): the first
        # `width` characters scaled by 1/127, zero-padded on the right.
        x = np.array([ord(c) / 127.0 for c in line[:width]], dtype=np.float32)
        if len(x) < width:
            x = np.pad(x, (0, width - len(x)))
        score, meta = filt.score(x.reshape(1, -1))
        status = "BLOCKED" if meta["blocked"] else "PASSED"
        print(f"[{status} score={score:.3f}] {line[:80]}")

    if args.file:
        for line in Path(args.file).read_text(encoding="utf-8").splitlines():
            report(line)
        return

    print("Interactive filter REPL (Ctrl-D to quit):")
    while True:
        try:
            line = input("> ")
        except EOFError:
            break
        # Score immediately so the session is actually interactive.
        report(line)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def cmd_cascade(args: argparse.Namespace) -> None:
    """Train a CascadeSTANNO from a config file and save it.

    Config format (JSON; the ``//`` notes below are explanatory only and
    are not valid JSON):
    {
        "stages": [
            {"layers": [784, 128], "trainer_type": "fixed", "learning_rate": 0.01},
            {"layers": [128, 784], "trainer_type": "fixed", "learning_rate": 0.01}
        ],
        "frozen": [false, false],   // optional, default all false
        "data": { ... same as regular config ... },
        "fit": {"epochs": 200, "batch_size": 64, "patience": 20, "log_every": 20},
        "mode": "endtoend",         // "endtoend" | "staged" (default: endtoend)
        "save_path": "examples/cascade.stanno.pkl"
    }

    Exits with status 1 when ``stages`` is missing/empty or ``mode`` is not
    one of the two supported values.
    """
    from stanno.config.schema import STANNOConfig
    from stanno.core.stanno import STANNO
    from stanno.integration.cascade import CascadeSTANNO

    cfg = _load_config(args.config)
    stage_cfgs = cfg.get("stages", [])
    if not stage_cfgs:
        print("Error: config must have a non-empty 'stages' list.", file=sys.stderr)
        sys.exit(1)

    mode = cfg.get("mode", "endtoend")
    if mode not in ("endtoend", "staged"):
        # Fail early: a misspelt mode used to fall through to end-to-end silently.
        print(f"Error: unknown mode {mode!r}; expected 'endtoend' or 'staged'.",
              file=sys.stderr)
        sys.exit(1)

    frozen = cfg.get("frozen", [False] * len(stage_cfgs))
    stages = []
    for sc in stage_cfgs:
        scfg = STANNOConfig(
            layers=sc.get("layers", [1, 32, 1]),
            trainer_type=sc.get("trainer_type", "fixed"),
            backend=sc.get("backend", "numpy"),
            learning_rate=sc.get("learning_rate", 0.01),
            feedback_projection=sc.get("feedback_projection", "repeat"),
            trainer_kwargs=sc.get("trainer_kwargs", {}),
        )
        stages.append(STANNO(scfg))

    cascade = CascadeSTANNO(stages, frozen=frozen)
    print(cascade)

    x, y = _load_data(cfg)
    fit_cfg = cfg.get("fit", {})
    epochs = fit_cfg.get("epochs", 100)
    batch_size = fit_cfg.get("batch_size", 32)
    patience = fit_cfg.get("patience", 20)
    log_every = fit_cfg.get("log_every", max(1, epochs // 10))

    print(f"Training cascade ({mode} mode) on {len(x)} samples …")

    if mode == "staged":
        # Staged mode needs one target per stage; the CLI has no way to
        # express intermediate targets, so the same y is used for every
        # stage (callers needing real intermediate targets should use the
        # Python API).
        histories = cascade.staged_fit(
            x,
            intermediate_targets=[y] * len(stages),
            epochs=epochs,
            batch_size=batch_size,
            patience=patience,
            log_every=log_every,
        )
        for k, h in enumerate(histories):
            if h:
                print(f" Stage {k} final loss: {h[-1]:.6f}")
    else:
        history = cascade.fit(
            x, y,
            epochs=epochs,
            batch_size=batch_size,
            patience=patience,
            log_every=log_every,
        )
        if history:
            print(f" Final loss: {history[-1]:.6f}")

    # Swap only the final extension so a config without ".json" in its name
    # is never overwritten by the saved cascade.
    save_path = cfg.get("save_path") or (os.path.splitext(args.config)[0] + ".cascade.pkl")
    cascade.save(save_path)
    print(f"Saved → {save_path}")
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def cmd_scan(args: argparse.Namespace) -> None:
    """Scan a dataset with a trained STANNO (DSANNO mode).

    The model is taken from ``--model``; otherwise from the config's
    ``save_path``, falling back to the config path with a ``.stanno.pkl``
    extension (the same rule the other sub-commands use).
    Scans ``--data`` (npy/npz file; an npz must contain an ``x`` array) and
    prints the top-k matches, the threshold-filtered rows, or summary
    statistics when neither option is given. Exits with status 1 when the
    model or data cannot be found.
    """
    from stanno.core.stanno import STANNO
    from stanno.integration.dsanno import DSANNO

    # Load STANNO
    model_path = getattr(args, "model", None)
    if model_path is None and args.config:
        cfg = _load_config(args.config)
        model_path = cfg.get("save_path") or (
            os.path.splitext(args.config)[0] + ".stanno.pkl"
        )
    if not model_path or not Path(model_path).exists():
        print(f"Error: model not found at {model_path!r}. Train first.", file=sys.stderr)
        sys.exit(1)

    stanno = STANNO.load(model_path)
    mode = getattr(args, "mode", "reconstruction")
    dsanno = DSANNO(stanno, mode=mode)

    # Load data
    data_path = getattr(args, "data", None)
    if not data_path or not Path(data_path).exists():
        print(f"Error: --data file not found: {data_path!r}", file=sys.stderr)
        sys.exit(1)

    loaded = np.load(data_path)
    if isinstance(loaded, np.lib.npyio.NpzFile):
        # An .npz keeps its archive open until closed; read the array and
        # release the file handle straight away.
        with loaded:
            if "x" not in loaded.files:
                print(f"Error: {data_path!r} has no 'x' array "
                      f"(found: {loaded.files}).", file=sys.stderr)
                sys.exit(1)
            loaded = loaded["x"]  # convention: 'x' key
    x = np.asarray(loaded, dtype=np.float32)
    print(f"Scanning {x.shape[0]} rows with {dsanno} …")

    top_k = getattr(args, "top_k", None)
    threshold = getattr(args, "threshold", None)

    if top_k:
        indices, scores, preds = dsanno.top_k(x, k=top_k)
        print(f"\nTop {top_k} matches (lowest reconstruction error):")
        for rank, (idx, sc) in enumerate(zip(indices, scores)):
            print(f" #{rank+1:3d} row={idx:6d} score={sc:.6f} x={x[idx].tolist()}")
    elif threshold is not None:
        result = dsanno.scan(x, threshold=float(threshold))
        print(result)
        matched = result.matched_indices()
        print(f"\n{len(matched)} rows matched (score ≤ {threshold}):")
        for idx in matched[:50]:  # show at most 50
            print(f" row={idx:6d} score={result.scores[idx]:.6f}")
        if len(matched) > 50:
            print(f" … and {len(matched) - 50} more")
    else:
        # Just print summary statistics
        result = dsanno.scan(x)
        print(result)
        p5, p50, p95 = np.percentile(result.scores, [5, 50, 95])
        print(f" score p5={p5:.4f} p50={p50:.4f} p95={p95:.4f}")
        suggested = dsanno.calibrate_threshold(x)
        print(f" Suggested threshold (p95 of this data): {suggested:.4f}")
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
# ─── entry point ──────────────────────────────────────────────────────────────
|
| 336 |
+
|
| 337 |
+
def main(argv: Optional[List[str]] = None) -> None:
    """Parse the command line and run the selected sub-command.

    ``argv`` defaults to the process arguments when None. A sub-command is
    mandatory; each one is routed to its ``cmd_*`` handler.
    """
    parser = argparse.ArgumentParser(
        prog="stanno",
        description="STANNO — Self-Training Artificial Neural Network Object CLI",
    )
    parser.add_argument("--version", action="version", version="stanno 0.1.0")
    commands = parser.add_subparsers(dest="command", required=True)

    def add_command(name: str, summary: str, config_help: str = "Path to JSON config"):
        """Register a sub-command that requires a --config option."""
        sub_parser = commands.add_parser(name, help=summary)
        sub_parser.add_argument("--config", required=True, help=config_help)
        return sub_parser

    add_command("train", "Train a STANNO from a config file")

    predict_parser = add_command("predict", "Run a single prediction")
    predict_parser.add_argument("--input", required=True, help="Comma-separated input values")

    add_command("dream", "Run dream (generative) mode")
    add_command("evaluate", "Evaluate on held-out data")

    filter_parser = add_command("filter", "Run anomaly filter on prompts")
    filter_parser.add_argument("--file", default=None, help="File of prompts (one per line)")

    add_command("cascade", "Train a CascadeSTANNO", config_help="Path to cascade JSON config")

    # scan is the only sub-command whose --config is optional.
    scan_parser = commands.add_parser(
        "scan", help="Scan a dataset with a trained STANNO (DSANNO)"
    )
    scan_parser.add_argument("--config", default=None, help="Config with save_path")
    scan_parser.add_argument("--model", default=None, help="Explicit path to .stanno.pkl model")
    scan_parser.add_argument("--data", required=True, help="Path to .npy / .npz file to scan")
    scan_parser.add_argument("--threshold", type=float, default=None, help="Match score threshold")
    scan_parser.add_argument("--top-k", type=int, default=None, dest="top_k",
                             help="Return k best-matching rows")
    scan_parser.add_argument("--mode", default="reconstruction",
                             choices=["reconstruction", "prediction"],
                             help="Scoring mode (default: reconstruction)")

    args = parser.parse_args(argv)

    handlers = {
        "train": cmd_train,
        "predict": cmd_predict,
        "dream": cmd_dream,
        "evaluate": cmd_evaluate,
        "filter": cmd_filter,
        "cascade": cmd_cascade,
        "scan": cmd_scan,
    }
    handlers[args.command](args)
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
if __name__ == "__main__":
|
| 398 |
+
main()
|
stanno/config/__init__.py
ADDED
|
File without changes
|
stanno/config/schema.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration dataclasses for the STANNO system.
|
| 3 |
+
|
| 4 |
+
All configuration is expressed as plain Python dataclasses so no extra
|
| 5 |
+
dependencies are needed. JSON config files are mapped to these objects
|
| 6 |
+
by the CLI and any other entry points.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from typing import Any, Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class STANNOConfig:
    """Architecture and training settings for a single STANNO object.

    Fields:
        layers: layer sizes, input and output dimensions included, e.g.
            ``[1, 32, 1]`` (1-D regression, matches the poc demo),
            ``[784, 256, 128, 10]`` (MNIST-scale classifier),
            ``[768, 256, 768]`` (CLIP-embedding autoencoder, SD 1.5) or
            ``[512, 256, 256, 512]`` (deep autoencoder).
        trainer_type: which TrainerNet to use —
            ``"fixed"`` (4-module patent-faithful implementation, default),
            ``"local_rule"`` (per-synapse learned update rule) or
            ``"evolutionary"`` (ES-based, no autodiff required).
        backend: numerical backend, ``"numpy"`` or ``"torch"``. Currently
            "numpy" is fully implemented; "torch" requires PyTorch and
            enables meta-training.
        learning_rate: base learning rate used by the active trainer.
        feedback_projection: how dream() feeds the output back as the next
            input when output_dim != input_dim —
            ``"repeat"`` (tile and truncate; default, preserves poc
            behaviour), ``"linear"`` (fixed random linear projection,
            initialized once) or ``"zeros"`` (zero-pad or truncate, no
            information fed back).
        trainer_kwargs: optional extra keyword arguments forwarded to the
            TrainerNet constructor (e.g. ``{"hidden_dim": 32}`` for
            LocalRuleTrainerNet).
    """

    # Mutable defaults go through default_factory so instances never share them.
    layers: List[int] = field(default_factory=lambda: [1, 32, 1])
    trainer_type: str = "fixed"
    backend: str = "numpy"
    learning_rate: float = 0.01
    feedback_projection: str = "repeat"
    trainer_kwargs: Dict[str, Any] = field(default_factory=dict)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass
class DataConfig:
    """
    Where the training data comes from and how it is prepared.

    Attributes
    ----------
    path : str
        Location of the data file (unused by built-in generators).
    format : str
        Kind of source:
            "csv"         — CSV or TSV flat file (requires pandas)
            "json"        — JSON or JSONL file
            "numpy"       — .npy or .npz file with pre-split arrays
            "builtin:sin" — built-in sin(x) generator (no file needed)
    input_cols, output_cols : list, optional
        Column names (CSV/JSON) or indices used as inputs / outputs.
        When None the loader applies its default split strategy.
    input_dim, output_dim : int, optional
        Override for the inferred input / output dimensionality.
    split_ratio : float
        Fraction of the data used for training; the remainder is
        test/validation data.
    normalize : bool
        Apply per-feature standardisation (zero mean, unit std).
    n_samples : int
        Number of samples produced by built-in generators.
    """

    path: str = ""
    format: str = "csv"
    input_cols: Optional[List[str]] = None
    output_cols: Optional[List[str]] = None
    input_dim: Optional[int] = None
    output_dim: Optional[int] = None
    split_ratio: float = 0.8
    normalize: bool = False
    n_samples: int = 512
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@dataclass
class LLMConfig:
    """Settings for an OpenAI-compatible LLM endpoint.

    The values stored here act as fallbacks: at runtime the matching
    environment variable, when set, takes priority over the field.

        STANNO_LLM_BASE_URL — overrides base_url
        STANNO_LLM_API_KEY  — overrides api_key
        STANNO_LLM_MODEL    — overrides model
    """

    base_url: str = "http://localhost:11434"
    model: str = "mistral"
    api_key: str = ""
    temperature: float = 0.7
    max_tokens: int = 200
    timeout_seconds: int = 30
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@dataclass
class FilterConfig:
    """Settings for the STANNOFilter anomaly-scoring layer.

    Attributes
    ----------
    anomaly_threshold : float
        Score threshold used by the filter.
    block_above_threshold : bool
        Flag controlling whether items above the threshold are blocked.
    metadata_field : str
        Name of the metadata field associated with the score.
    """

    anomaly_threshold: float = 0.15
    block_above_threshold: bool = False
    metadata_field: str = "stanno_score"
|
stanno/core/__init__.py
ADDED
|
File without changes
|
stanno/core/backend.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Backend abstraction layer.
|
| 3 |
+
|
| 4 |
+
Provides a BackendProtocol (structural typing) and a NumPyBackend concrete
|
| 5 |
+
implementation. A TorchBackend stub is included for future use; it will be
|
| 6 |
+
wired in when meta-training via PyTorch is needed.
|
| 7 |
+
|
| 8 |
+
All TraineeNet and TrainerNet operations go through the backend so the same
|
| 9 |
+
algorithm can run on NumPy arrays or PyTorch tensors without changing the
|
| 10 |
+
core logic.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
from typing import Optional, Protocol, Tuple, runtime_checkable
|
| 15 |
+
|
| 16 |
+
import numpy as np
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ─── Protocol ────────────────────────────────────────────────────────────────
|
| 20 |
+
|
| 21 |
+
@runtime_checkable
class BackendProtocol(Protocol):
    """Structural interface listing the array operations STANNO internals use.

    Any object providing these methods satisfies the protocol; because the
    class is ``runtime_checkable``, ``isinstance`` checks verify that the
    methods are present (not their signatures).
    """

    def zeros(self, shape: Tuple[int, ...]) -> np.ndarray:
        """Return an array of zeros with the given shape."""

    def ones(self, shape: Tuple[int, ...]) -> np.ndarray:
        """Return an array of ones with the given shape."""

    def normal(
        self,
        mean: float,
        std: float,
        shape: Tuple[int, ...],
        rng: Optional[np.random.Generator] = None,
    ) -> np.ndarray:
        """Return Gaussian samples of the given shape, optionally from ``rng``."""

    def tanh(self, x: np.ndarray) -> np.ndarray:
        """Element-wise hyperbolic tangent."""

    def matmul(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
        """Matrix product of ``a`` and ``b``."""

    def sum(
        self, x: np.ndarray, axis=None, keepdims: bool = False
    ) -> np.ndarray:
        """Sum over ``axis`` (all elements when None)."""

    def mean(self, x: np.ndarray, axis=None) -> np.ndarray:
        """Mean over ``axis`` (all elements when None)."""

    def sqrt(self, x: np.ndarray) -> np.ndarray:
        """Element-wise square root."""

    def clip(self, x: np.ndarray, min_val: float, max_val: float) -> np.ndarray:
        """Limit values of ``x`` to the interval [min_val, max_val]."""

    def copy(self, x: np.ndarray) -> np.ndarray:
        """Return an independent copy of ``x``."""

    def to_numpy(self, x) -> np.ndarray:
        """Convert a backend array to a NumPy array."""

    def from_numpy(self, arr: np.ndarray) -> np.ndarray:
        """Convert a NumPy array to the backend's array type."""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ─── NumPy backend ───────────────────────────────────────────────────────────
|
| 48 |
+
|
| 49 |
+
class NumPyBackend:
    """Default backend built on NumPy; needs no extra dependencies.

    Array constructors and converters produce float32 arrays; the element-wise
    and reduction helpers are thin wrappers around the NumPy functions of the
    same name.
    """

    def zeros(self, shape):
        """Return a float32 array of zeros with the given shape."""
        return np.zeros(shape, dtype=np.float32)

    def ones(self, shape):
        """Return a float32 array of ones with the given shape."""
        return np.ones(shape, dtype=np.float32)

    def normal(self, mean, std, shape, rng=None):
        """Draw float32 Gaussian samples, using ``rng`` or a fresh generator."""
        generator = np.random.default_rng() if rng is None else rng
        samples = generator.normal(mean, std, shape)
        return samples.astype(np.float32)

    def tanh(self, x):
        """Element-wise hyperbolic tangent."""
        return np.tanh(x)

    def matmul(self, a, b):
        """Matrix product via the ``@`` operator."""
        return a @ b

    def sum(self, x, axis=None, keepdims=False):
        """Sum over ``axis`` (all elements when None)."""
        return np.sum(x, axis=axis, keepdims=keepdims)

    def mean(self, x, axis=None):
        """Mean over ``axis`` (all elements when None)."""
        return np.mean(x, axis=axis)

    def sqrt(self, x):
        """Element-wise square root."""
        return np.sqrt(x)

    def clip(self, x, min_val, max_val):
        """Limit values of ``x`` to [min_val, max_val]."""
        return np.clip(x, min_val, max_val)

    def copy(self, x):
        """Return an independent copy of ``x``."""
        return x.copy()

    def to_numpy(self, x):
        """View/convert ``x`` as a float32 NumPy array."""
        return np.asarray(x, dtype=np.float32)

    def from_numpy(self, arr):
        """View/convert ``arr`` as a float32 NumPy array."""
        return np.asarray(arr, dtype=np.float32)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# ─── Torch backend stub ──────────────────────────────────────────────────────
|
| 92 |
+
|
| 93 |
+
class TorchBackend:
    """
    PyTorch backend stub — available when torch is installed.

    Used for meta-training LocalRuleTrainerNet via unrolled gradient descent.
    If PyTorch cannot be imported, construction raises a clear ImportError
    instead of failing later, deep inside training.

    All constructors produce float32 tensors.
    """

    def __init__(self):
        # Import lazily so the rest of the package works without PyTorch.
        try:
            import torch
            self._torch = torch
        except ImportError as exc:
            raise ImportError(
                "TorchBackend requires PyTorch. Install it with:\n"
                " pip install torch\n"
                "or use the NumPy backend instead."
            ) from exc

    def zeros(self, shape):
        """Return a float32 tensor of zeros with the given shape."""
        return self._torch.zeros(shape, dtype=self._torch.float32)

    def ones(self, shape):
        """Return a float32 tensor of ones with the given shape."""
        return self._torch.ones(shape, dtype=self._torch.float32)

    def normal(self, mean, std, shape, rng=None):
        """Draw float32 Gaussian samples of the given shape.

        When a NumPy ``rng`` is supplied the samples are drawn from it, so a
        seeded generator gives reproducible tensors (matching the NumPy
        backend).  Without ``rng`` torch's global generator is used.
        """
        if rng is not None:
            samples = np.asarray(rng.normal(mean, std, shape), dtype=np.float32)
            return self._torch.from_numpy(samples)
        t = self._torch.zeros(shape, dtype=self._torch.float32)
        return t.normal_(mean, std)

    def tanh(self, x):
        """Element-wise hyperbolic tangent."""
        return self._torch.tanh(x)

    def matmul(self, a, b):
        """Matrix product via the ``@`` operator."""
        return a @ b

    def sum(self, x, axis=None, keepdims=False):
        """Sum over ``axis`` (all elements when None).

        With ``axis=None`` and ``keepdims=True`` the result keeps one
        singleton dimension per input dimension, as ``np.sum`` does.
        """
        if axis is None:
            total = x.sum()
            if keepdims:
                total = total.reshape((1,) * x.dim())
            return total
        return x.sum(dim=axis, keepdim=keepdims)

    def mean(self, x, axis=None):
        """Mean over ``axis`` (all elements when None)."""
        if axis is None:
            return x.mean()
        return x.mean(dim=axis)

    def sqrt(self, x):
        """Element-wise square root."""
        return self._torch.sqrt(x)

    def clip(self, x, min_val, max_val):
        """Limit values of ``x`` to [min_val, max_val]."""
        return self._torch.clamp(x, min_val, max_val)

    def copy(self, x):
        """Return an independent copy of tensor ``x``."""
        return x.clone()

    def to_numpy(self, x):
        """Detach ``x`` from the graph, move it to CPU and return a NumPy array."""
        return x.detach().cpu().numpy()

    def from_numpy(self, arr):
        """Convert ``arr`` to a float32 tensor."""
        return self._torch.from_numpy(np.asarray(arr, dtype=np.float32))
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# ─── factory ─────────────────────────────────────────────────────────────────
|
| 156 |
+
|
| 157 |
+
def make_backend(name: str) -> "BackendProtocol":
    """Build the backend registered under ``name``.

    Parameters
    ----------
    name : str
        Either "numpy" or "torch".

    Raises
    ------
    ValueError
        If ``name`` is neither of the supported backend names.
    """
    if name not in ("numpy", "torch"):
        raise ValueError(f"Unknown backend: {name!r}. Choose 'numpy' or 'torch'.")
    return NumPyBackend() if name == "numpy" else TorchBackend()
|
stanno/core/stanno.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
STANNO — Self-Training Artificial Neural Network Object.
|
| 3 |
+
|
| 4 |
+
Orchestrates the TraineeNet + AbstractTrainerNet pair.
|
| 5 |
+
|
| 6 |
+
fit() drives the training loop:
|
| 7 |
+
1. forward pass on TraineeNet → y_pred + TraineeState
|
| 8 |
+
2. inject y_batch and loss history into state
|
| 9 |
+
3. call trainer.compute_updates(state) → (ΔW, Δb)
|
| 10 |
+
4. apply_updates() directly on TraineeNet weights
|
| 11 |
+
|
| 12 |
+
No explicit backpropagation code lives here — the trainer handles all of
|
| 13 |
+
that logic (or learns to handle it, in the case of LocalRuleTrainerNet).
|
| 14 |
+
|
| 15 |
+
dream() generates sequences using a shadow copy of the trained net with
|
| 16 |
+
optional weight noise and optional input blinding, implementing Thaler's
|
| 17 |
+
"creative perturbation of internal state" concept.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
import copy
|
| 22 |
+
import pickle
|
| 23 |
+
from typing import Callable, List, Optional, Tuple
|
| 24 |
+
|
| 25 |
+
import numpy as np
|
| 26 |
+
|
| 27 |
+
from stanno.config.schema import STANNOConfig
|
| 28 |
+
from stanno.core.backend import make_backend
|
| 29 |
+
from stanno.core.trainee import TraineeNet
|
| 30 |
+
from stanno.core.trainer import AbstractTrainerNet
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ─── trainer factory ─────────────────────────────────────────────────────────
|
| 34 |
+
|
| 35 |
+
def _make_trainer(config: STANNOConfig) -> AbstractTrainerNet:
|
| 36 |
+
from stanno.trainers.fixed import FixedTrainerNet
|
| 37 |
+
from stanno.trainers.local_rule import LocalRuleTrainerNet
|
| 38 |
+
from stanno.trainers.evolutionary import EvolutionaryTrainerNet
|
| 39 |
+
|
| 40 |
+
kwargs = config.trainer_kwargs
|
| 41 |
+
lr = config.learning_rate
|
| 42 |
+
|
| 43 |
+
if config.trainer_type == "fixed":
|
| 44 |
+
return FixedTrainerNet(learning_rate=lr, **kwargs)
|
| 45 |
+
if config.trainer_type == "local_rule":
|
| 46 |
+
return LocalRuleTrainerNet(learning_rate=lr, **kwargs)
|
| 47 |
+
if config.trainer_type == "evolutionary":
|
| 48 |
+
return EvolutionaryTrainerNet(learning_rate=lr, **kwargs)
|
| 49 |
+
raise ValueError(
|
| 50 |
+
f"Unknown trainer_type {config.trainer_type!r}. "
|
| 51 |
+
"Choose 'fixed', 'local_rule', or 'evolutionary'."
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ─── STANNO ──────────────────────────────────────────────────────────────────
|
| 56 |
+
|
| 57 |
+
class STANNO:
    """
    Self-Training Artificial Neural Network Object.

    Pairs a TraineeNet (the network being trained) with a TrainerNet that
    computes its weight updates, and adds "dreaming": sequence generation
    from a noisy shadow copy of the trained network.

    Parameters
    ----------
    config : STANNOConfig
        Full architecture and training configuration.
    rng : np.random.Generator, optional
        Seeded RNG for reproducibility (weight init, shuffling, projection).
    """

    def __init__(
        self,
        config: "STANNOConfig",
        rng: Optional[np.random.Generator] = None,
    ) -> None:
        self.config = config
        self._rng = rng or np.random.default_rng()
        self.backend = make_backend(config.backend)
        self.net = TraineeNet(config.layers, backend=self.backend, rng=self._rng)
        self.trainer: "AbstractTrainerNet" = _make_trainer(config)
        # Rolling window of per-epoch mean losses (at most 100 entries).
        self._loss_history: List[float] = []
        # Used by the filter for score normalisation; set at the end of fit().
        self._train_mse_norm: Optional[float] = None

        # Optional fixed projection for dream feedback when output_dim != input_dim.
        self._projection: Optional[np.ndarray] = None
        if (
            config.feedback_projection == "linear"
            and config.layers[-1] != config.layers[0]
        ):
            self._projection = self._rng.normal(
                0.0, 0.1, (config.layers[-1], config.layers[0])
            ).astype(np.float32)

    # ── training ─────────────────────────────────────────────────────────────

    def fit(
        self,
        x: np.ndarray,
        y: np.ndarray,
        epochs: int = 1000,
        batch_size: int = 32,
        shuffle: bool = True,
        callback: Optional[Callable[[int, float], None]] = None,
        divergence_threshold: float = 100.0,
        patience: int = 20,
    ) -> None:
        """
        Train the STANNO using its internal TrainerNet.

        Parameters
        ----------
        x, y : ndarray
            Training data of shape (N, input_dim) and (N, output_dim).
        epochs : int
            Maximum number of full passes over the dataset.
        batch_size : int
            Mini-batch size (must be positive).
        shuffle : bool
            Shuffle data each epoch.
        callback : callable(epoch, mean_loss), optional
            Called after each epoch that does not end training.
        divergence_threshold : float
            If an epoch's mean loss exceeds this, or is NaN/inf, training is
            halted immediately (divergence detected).
        patience : int
            Number of epochs without improvement before early stopping
            (0 = disabled).

        Raises
        ------
        ValueError
            If the data is empty, x and y have different numbers of rows,
            or batch_size is not positive.
        """
        x = np.asarray(x, dtype=np.float32)
        y = np.asarray(y, dtype=np.float32)
        n_samples = x.shape[0]

        if n_samples == 0:
            raise ValueError("fit() requires at least one training sample.")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"x and y must have the same number of rows, "
                f"got {n_samples} and {y.shape[0]}."
            )
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}.")

        best_loss = float("inf")
        patience_counter = 0

        for epoch in range(epochs):
            idx = np.arange(n_samples)
            if shuffle:
                self._rng.shuffle(idx)
            x_shuf, y_shuf = x[idx], y[idx]

            epoch_losses: List[float] = [
                self._trainer_step(
                    x_shuf[start: start + batch_size],
                    y_shuf[start: start + batch_size],
                )
                for start in range(0, n_samples, batch_size)
            ]

            mean_loss = float(np.mean(epoch_losses))
            self._loss_history.append(mean_loss)
            # Keep only the 100 most recent epochs.
            del self._loss_history[:-100]

            # Divergence detection.  NaN never compares greater than the
            # threshold, so non-finite losses are checked explicitly.
            if not np.isfinite(mean_loss) or mean_loss > divergence_threshold:
                print(
                    f" ⚠ DIVERGENCE DETECTED at epoch {epoch}: "
                    f"loss={mean_loss:.6f} exceeds threshold {divergence_threshold}. "
                    f"Halting training. Consider: reducing learning_rate, using fewer epochs, "
                    f"or checking data normalization."
                )
                break

            # Early stopping (patience-based)
            if patience > 0:
                if mean_loss < best_loss:
                    best_loss = mean_loss
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        print(
                            f" ⓘ Early stopping at epoch {epoch}: "
                            f"no improvement for {patience} epochs. "
                            f"Best loss: {best_loss:.6f}"
                        )
                        break

            if callback is not None:
                callback(epoch, mean_loss)

        # Record the 95th percentile of the last (up to) 10 epoch losses for
        # normalisation in STANNOFilter.  Zero or non-finite values fall back
        # to 1.0 so the normaliser is always usable as a divisor.
        if self._loss_history:
            norm = float(np.percentile(self._loss_history[-10:], 95))
            self._train_mse_norm = norm if (np.isfinite(norm) and norm != 0.0) else 1.0

    def _trainer_step(self, x: np.ndarray, y: np.ndarray) -> float:
        """One mini-batch update: forward → state → trainer → apply deltas.

        Returns the batch MSE measured before the update is applied.
        """
        y_pred, state = self.net.forward(x)
        state.y_batch = y
        state.y_pred = y_pred
        # Hand the trainer a snapshot so it cannot mutate our history.
        state.loss_history = self._loss_history.copy()

        loss = float(np.mean((y_pred - y) ** 2))

        weight_deltas, bias_deltas = self.trainer.compute_updates(state)
        self.net.apply_updates(weight_deltas, bias_deltas)

        return loss

    # ── inference ────────────────────────────────────────────────────────────

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Standard forward pass (no weight modification).

        The output is returned as produced by the network, without clipping.
        """
        x = np.asarray(x, dtype=np.float32)
        y_pred, _ = self.net.forward(x)
        return y_pred

    # ── dreaming ─────────────────────────────────────────────────────────────

    def dream(
        self,
        num_steps: int = 128,
        input_seed: Optional[np.ndarray] = None,
        noise_sigma: float = 0.0,
        blind_inputs: bool = False,
        rng: Optional[np.random.Generator] = None,
    ) -> np.ndarray:
        """
        Generate a sequence of outputs by driving a noisy shadow copy of the net.

        The shadow copy is created once per dream() call; noise is injected
        once into its weights before the loop starts.  The trained net is
        never modified.

        Parameters
        ----------
        num_steps : int
            Number of output frames to generate.  Zero or a negative value
            yields an empty (0, output_dim) array.
        input_seed : ndarray (1, input_dim), optional
            Starting input.  Defaults to zeros.
        noise_sigma : float
            Gaussian noise std added to shadow weights.
              0.0      = exact replay of learned patterns (quiet)
              0.05–0.3 = creative variation (sweet spot per Thaler's IEI)
              >0.5     = chaotic / exploratory
        blind_inputs : bool
            If True, force input to zero each step (pure internal dynamics).
        rng : np.random.Generator, optional
            RNG for reproducibility.  A fresh unseeded generator is used when
            omitted.

        Returns
        -------
        ndarray of shape (num_steps, output_dim)
        """
        _rng = rng or np.random.default_rng()

        # Shadow copy — noise applied once
        shadow = self.net.clone()
        shadow.apply_parameter_noise(noise_sigma, rng=_rng)

        if input_seed is None:
            x = np.zeros((1, self.net.input_dim), dtype=np.float32)
        else:
            x = np.asarray(input_seed, dtype=np.float32).reshape(1, -1)

        outputs: List[np.ndarray] = []
        for _ in range(num_steps):
            x_step = np.zeros_like(x) if blind_inputs else x
            y, _ = shadow.forward(x_step)
            outputs.append(y.copy())
            x = self._feedback_project(y)

        if not outputs:
            # np.concatenate rejects an empty list; return an empty sequence.
            return np.zeros((0, self.net.output_dim), dtype=np.float32)
        return np.concatenate(outputs, axis=0)  # (num_steps, output_dim)

    def _feedback_project(self, y: np.ndarray) -> np.ndarray:
        """
        Project dream output y back to input_dim for the next step.

        When input_dim == output_dim, y is returned unchanged.  Otherwise
        config.feedback_projection selects the mapping:
            "zeros"  — always feed zeros (pure internal dynamics)
            "linear" — fixed random projection matrix
        Any other value, including the default "repeat", repeats each output
        entry and truncates to input_dim (no extra parameters).
        """
        in_dim = self.net.input_dim
        out_dim = self.net.output_dim

        if in_dim == out_dim:
            return y

        mode = self.config.feedback_projection

        if mode == "zeros":
            return np.zeros((1, in_dim), dtype=np.float32)

        if mode == "linear":
            if self._projection is None:
                # Initialise lazily if not done in __init__; prefer the
                # instance RNG so seeded objects stay reproducible.
                generator = getattr(self, "_rng", None) or np.random.default_rng()
                self._projection = generator.normal(
                    0.0, 0.1, (out_dim, in_dim)
                ).astype(np.float32)
            return y @ self._projection  # (1, in_dim)

        # default: "repeat" — repeat output entries until input_dim is filled
        factor = in_dim // out_dim + 1
        return np.repeat(y, factor, axis=1)[:, :in_dim]

    # ── serialisation ─────────────────────────────────────────────────────────

    def save(self, path: str) -> None:
        """Pickle the full STANNO to disk."""
        with open(path, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path: str) -> "STANNO":
        """Load a pickled STANNO from disk.

        WARNING: unpickling can execute arbitrary code.  Only load files that
        come from a trusted source.
        """
        with open(path, "rb") as f:
            return pickle.load(f)

    # ── misc ─────────────────────────────────────────────────────────────────

    def __repr__(self) -> str:
        return (
            f"STANNO(layers={self.config.layers}, "
            f"trainer={self.trainer.__class__.__name__}, "
            f"backend={self.config.backend})"
        )
|
stanno/core/trainee.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
TraineeNet — the network that gets trained by a TrainerNet.
|
| 3 |
+
|
| 4 |
+
Key differences from stanno_poc.py:
|
| 5 |
+
- Variable depth: layers: List[int] replaces input/hidden/output_dim
|
| 6 |
+
- Backend-injected: all ops go through BackendProtocol for NumPy/Torch compat
|
| 7 |
+
- forward() returns a TraineeState capturing activations AND pre-activations
|
| 8 |
+
- apply_updates() receives (weight_deltas, bias_deltas) from TrainerNet
|
| 9 |
+
- Xavier initialisation, tanh hidden activations, linear output (same as poc)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
import copy
|
| 14 |
+
from typing import List, Optional, Tuple
|
| 15 |
+
|
| 16 |
+
import numpy as np
|
| 17 |
+
|
| 18 |
+
from stanno.core.backend import BackendProtocol, NumPyBackend
|
| 19 |
+
from stanno.core.trainer import TraineeState
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TraineeNet:
|
| 23 |
+
"""
|
| 24 |
+
N-layer MLP (input → hidden₁ → … → hiddenₙ₋₁ → output).
|
| 25 |
+
|
| 26 |
+
The output layer is always linear (no activation).
|
| 27 |
+
All hidden layers use tanh activation.
|
| 28 |
+
|
| 29 |
+
Parameters
|
| 30 |
+
----------
|
| 31 |
+
layers : List[int]
|
| 32 |
+
Sizes of each layer including input and output.
|
| 33 |
+
Must have at least 2 elements. Example: [1, 32, 1].
|
| 34 |
+
backend : BackendProtocol, optional
|
| 35 |
+
Numerical backend. Defaults to NumPyBackend.
|
| 36 |
+
rng : np.random.Generator, optional
|
| 37 |
+
Seeded RNG for reproducibility.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
def __init__(
|
| 41 |
+
self,
|
| 42 |
+
layers: List[int],
|
| 43 |
+
backend: Optional[BackendProtocol] = None,
|
| 44 |
+
rng: Optional[np.random.Generator] = None,
|
| 45 |
+
) -> None:
|
| 46 |
+
if len(layers) < 2:
|
| 47 |
+
raise ValueError(f"layers must have at least 2 elements, got {layers}")
|
| 48 |
+
self.layers = layers
|
| 49 |
+
self.backend = backend or NumPyBackend()
|
| 50 |
+
self._rng = rng or np.random.default_rng()
|
| 51 |
+
self._init_weights()
|
| 52 |
+
|
| 53 |
+
# ── initialisation ───────────────────────────────────────────────────────
|
| 54 |
+
|
| 55 |
+
def _init_weights(self) -> None:
|
| 56 |
+
self.weights: List[np.ndarray] = []
|
| 57 |
+
self.biases: List[np.ndarray] = []
|
| 58 |
+
for i in range(len(self.layers) - 1):
|
| 59 |
+
fan_in = self.layers[i]
|
| 60 |
+
fan_out = self.layers[i + 1]
|
| 61 |
+
# Xavier uniform
|
| 62 |
+
limit = float(np.sqrt(6.0 / (fan_in + fan_out)))
|
| 63 |
+
W = self._rng.uniform(-limit, limit, (fan_in, fan_out)).astype(np.float32)
|
| 64 |
+
b = np.zeros((1, fan_out), dtype=np.float32)
|
| 65 |
+
self.weights.append(W)
|
| 66 |
+
self.biases.append(b)
|
| 67 |
+
|
| 68 |
+
# ── forward pass ─────────────────────────────────────────────────────────
|
| 69 |
+
|
| 70 |
+
def forward(self, x: np.ndarray) -> Tuple[np.ndarray, TraineeState]:
|
| 71 |
+
"""
|
| 72 |
+
Run a forward pass.
|
| 73 |
+
|
| 74 |
+
Returns the prediction and a TraineeState populated with:
|
| 75 |
+
activations[0] = x (input)
|
| 76 |
+
activations[i] = tanh(z_{i-1}) for i = 1 … n-1
|
| 77 |
+
activations[n] = z_{n-1} (linear output = y_pred)
|
| 78 |
+
pre_activations[i] = z_i = activations[i] @ W_i + b_i
|
| 79 |
+
where n = len(self.weights).
|
| 80 |
+
"""
|
| 81 |
+
x = np.asarray(x, dtype=np.float32)
|
| 82 |
+
activations: List[np.ndarray] = [x]
|
| 83 |
+
pre_activations: List[np.ndarray] = []
|
| 84 |
+
|
| 85 |
+
current = x
|
| 86 |
+
for i, (W, b) in enumerate(zip(self.weights, self.biases)):
|
| 87 |
+
z = current @ W + b
|
| 88 |
+
pre_activations.append(z)
|
| 89 |
+
if i < len(self.weights) - 1:
|
| 90 |
+
a = np.tanh(z)
|
| 91 |
+
else:
|
| 92 |
+
a = z # linear output
|
| 93 |
+
activations.append(a)
|
| 94 |
+
current = a
|
| 95 |
+
|
| 96 |
+
y_pred = current
|
| 97 |
+
|
| 98 |
+
state = TraineeState(
|
| 99 |
+
weights=self.weights, # live references — no copy
|
| 100 |
+
biases=self.biases,
|
| 101 |
+
activations=activations,
|
| 102 |
+
pre_activations=pre_activations,
|
| 103 |
+
x_batch=x,
|
| 104 |
+
y_batch=np.zeros_like(y_pred), # filled by STANNO._trainer_step
|
| 105 |
+
y_pred=y_pred,
|
| 106 |
+
)
|
| 107 |
+
return y_pred, state
|
| 108 |
+
|
| 109 |
+
# ── weight manipulation ──────────────────────────────────────────────────
|
| 110 |
+
|
| 111 |
+
def apply_updates(
|
| 112 |
+
self,
|
| 113 |
+
weight_deltas: List[np.ndarray],
|
| 114 |
+
bias_deltas: List[np.ndarray],
|
| 115 |
+
) -> None:
|
| 116 |
+
"""Apply in-place updates produced by a TrainerNet."""
|
| 117 |
+
for i, (dW, db) in enumerate(zip(weight_deltas, bias_deltas)):
|
| 118 |
+
self.weights[i] += dW
|
| 119 |
+
self.biases[i] += db
|
| 120 |
+
|
| 121 |
+
def apply_parameter_noise(
|
| 122 |
+
self,
|
| 123 |
+
sigma: float,
|
| 124 |
+
rng: Optional[np.random.Generator] = None,
|
| 125 |
+
) -> None:
|
| 126 |
+
"""Add Gaussian noise (std=sigma) to all weights and biases in-place."""
|
| 127 |
+
if sigma <= 0.0:
|
| 128 |
+
return
|
| 129 |
+
_rng = rng or np.random.default_rng()
|
| 130 |
+
for W in self.weights:
|
| 131 |
+
W += _rng.normal(0.0, sigma, W.shape).astype(np.float32)
|
| 132 |
+
for b in self.biases:
|
| 133 |
+
b += _rng.normal(0.0, sigma, b.shape).astype(np.float32)
|
| 134 |
+
|
| 135 |
+
def lesion(
|
| 136 |
+
self,
|
| 137 |
+
fraction: float,
|
| 138 |
+
rng: Optional[np.random.Generator] = None,
|
| 139 |
+
) -> None:
|
| 140 |
+
"""
|
| 141 |
+
Randomly zero out a fraction of weight entries (not biases).
|
| 142 |
+
|
| 143 |
+
Simulates progressive connection death as described in the original
|
| 144 |
+
STANNO literature ("tunnel vision" → "brain death" progression).
|
| 145 |
+
"""
|
| 146 |
+
fraction = float(np.clip(fraction, 0.0, 1.0))
|
| 147 |
+
if fraction <= 0.0:
|
| 148 |
+
return
|
| 149 |
+
_rng = rng or np.random.default_rng()
|
| 150 |
+
for W in self.weights:
|
| 151 |
+
mask = _rng.random(W.shape) < fraction
|
| 152 |
+
W[mask] = 0.0
|
| 153 |
+
|
| 154 |
+
def clone(self) -> "TraineeNet":
|
| 155 |
+
"""Return a deep copy with the same weights."""
|
| 156 |
+
return copy.deepcopy(self)
|
| 157 |
+
|
| 158 |
+
# ── misc ─────────────────────────────────────────────────────────────────
|
| 159 |
+
|
| 160 |
+
@property
|
| 161 |
+
def input_dim(self) -> int:
|
| 162 |
+
return self.layers[0]
|
| 163 |
+
|
| 164 |
+
@property
|
| 165 |
+
def output_dim(self) -> int:
|
| 166 |
+
return self.layers[-1]
|
| 167 |
+
|
| 168 |
+
def parameter_count(self) -> int:
    """Return the total number of scalar entries across all weights and biases."""
    return sum(arr.size for arr in (*self.weights, *self.biases))
|
| 172 |
+
|
| 173 |
+
def __repr__(self) -> str:
    """Summarise the layer sizes and the thousands-separated parameter count."""
    n_params = self.parameter_count()
    return f"TraineeNet(layers={self.layers}, params={n_params:,})"
|
stanno/core/trainer.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
TraineeState dataclass and AbstractTrainerNet ABC.
|
| 3 |
+
|
| 4 |
+
TraineeState carries everything a TrainerNet may need:
|
| 5 |
+
- current weight matrices and biases
|
| 6 |
+
- forward-pass activations (post-activation) and pre-activations (z = Wx+b)
|
| 7 |
+
- the current batch inputs, targets, and predictions
|
| 8 |
+
- an optional loss history window
|
| 9 |
+
- a slot for pre-computed gradients (used by some trainers)
|
| 10 |
+
|
| 11 |
+
AbstractTrainerNet defines the interface every TrainerNet must implement.
|
| 12 |
+
The only required method is compute_updates(); meta_train, save, and load
|
| 13 |
+
have default implementations (meta_train is a no-op; save/load pickle the trainer), so simple trainers don't need to override them.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
from abc import ABC, abstractmethod
|
| 18 |
+
from dataclasses import dataclass, field
|
| 19 |
+
from typing import List, Optional, Tuple
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ─── TraineeState ─────────────────────────────────────────────────────────────
|
| 25 |
+
|
| 26 |
+
@dataclass
class TraineeState:
    """
    Everything a trainer gets to see about a TraineeNet after one forward pass.

    With n weight matrices the lists are indexed as follows:

        activations[0]      input x                       (batch, layers[0])
        activations[i]      a_i                           (batch, layers[i])
        activations[n]      y_pred                        (batch, layers[n])
        pre_activations[i]  z_i = a_i @ W_i + b_i         (batch, layers[i+1])

    Hence len(activations) == n + 1 and len(pre_activations) == n.
    """

    # Network parameters, one entry per layer transition: W_0 .. W_{n-1}.
    weights: List[np.ndarray]
    # Matching bias vectors: b_0 .. b_{n-1}.
    biases: List[np.ndarray]
    # Post-activation values: [x, a_1, ..., y_pred].
    activations: List[np.ndarray]
    # Pre-activation values: [z_0, ..., z_{n-1}].
    pre_activations: List[np.ndarray]
    # Inputs of the current batch.
    x_batch: np.ndarray
    # Targets of the current batch (set by STANNO).
    y_batch: np.ndarray
    # Predictions for the current batch.
    y_pred: np.ndarray
    # Optional window of recent losses; each instance gets its own list.
    loss_history: List[float] = field(default_factory=list)
    # Optional pre-computed ∂L/∂W, used by some trainers.
    gradients: Optional[List[np.ndarray]] = None
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ─── AbstractTrainerNet ───────────────────────────────────────────────────────
|
| 52 |
+
|
| 53 |
+
class AbstractTrainerNet(ABC):
    """
    Base class for all TrainerNet implementations.

    Subclasses must implement compute_updates().  Everything else is
    optional and ships with a default:

      - compute_cascade_updates() wraps compute_updates() and passes no
        gradient upstream,
      - meta_train() is a no-op,
      - save() / load() persist the trainer with pickle.
    """

    @abstractmethod
    def compute_updates(
        self, state: TraineeState
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """
        Compute weight and bias updates given the current TraineeState.

        Returns:
            weight_deltas: list of Δ arrays, same shapes as state.weights
            bias_deltas:   list of Δ arrays, same shapes as state.biases

        The STANNO orchestrator applies these as:
            W_i += weight_deltas[i]
            b_i += bias_deltas[i]
        """

    def compute_cascade_updates(
        self,
        state: "TraineeState",
        output_delta: Optional[np.ndarray] = None,
    ) -> Tuple[List[np.ndarray], List[np.ndarray], Optional[np.ndarray]]:
        """
        Like compute_updates(), but supports multi-stage cascade training.

        output_delta : ndarray (batch, output_dim), optional
            Pre-computed gradient at the output layer from a downstream stage.
            When None the trainer computes the output delta from state.y_batch
            as usual.

        Returns
        -------
        weight_deltas : List[ndarray]
        bias_deltas   : List[ndarray]
        input_gradient : ndarray (batch, input_dim) or None
            Gradient w.r.t. the network's input activations.  Pass this as
            ``output_delta`` to the preceding stage in a CascadeSTANNO.

        NOTE: this default implementation ignores ``output_delta`` entirely:
        it delegates to compute_updates(state) and returns ``None`` as the
        input gradient, so no gradient flows backward past this stage.
        Trainers that support cascades must override it.
        """
        dW, db = self.compute_updates(state)
        return dW, db, None

    def meta_train(self, tasks) -> None:
        """
        Optional: adapt this TrainerNet's own parameters on a list of tasks.

        Each task is expected to be a dict with at least:
            {"x_train": ndarray, "y_train": ndarray,
             "x_test":  ndarray, "y_test":  ndarray}

        Default: no-op.  Override in trainers that have learnable parameters.
        """

    def save(self, path: str) -> None:
        """Persist the whole trainer object to ``path`` with pickle."""
        import pickle
        with open(path, "wb") as f:
            pickle.dump(self, f)

    def load(self, path: str) -> None:
        """
        Load trainer parameters from ``path`` and merge them into ``self``.

        The file must contain a pickled trainer (as written by save()); its
        instance attributes overwrite the attributes of this object.

        Raises
        ------
        TypeError
            If the unpickled object is not an AbstractTrainerNet.
        """
        import pickle
        with open(path, "rb") as f:
            # SECURITY: unpickling executes arbitrary code embedded in the
            # file.  Only load trainer files that come from a trusted source.
            loaded = pickle.load(f)
        # Refuse to merge the attributes of an unrelated object: doing so
        # would silently corrupt this trainer's state.
        if not isinstance(loaded, AbstractTrainerNet):
            raise TypeError(
                f"Expected a pickled trainer in {path!r}, "
                f"got {type(loaded).__name__}."
            )
        self.__dict__.update(loaded.__dict__)
|
stanno/data/__init__.py
ADDED
|
File without changes
|
stanno/data/base.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data loading abstractions.
|
| 3 |
+
|
| 4 |
+
DataSource is the ABC that all loaders implement.
|
| 5 |
+
A single make_loader() factory creates the right loader from a DataConfig.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
from abc import ABC, abstractmethod
|
| 10 |
+
from typing import Tuple
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
from stanno.config.schema import DataConfig
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DataSource(ABC):
    """Common base for every data loader; holds the config and shared helpers."""

    def __init__(self, config: DataConfig) -> None:
        self.config = config

    @abstractmethod
    def load(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return the (X, Y) arrays, shaped (N, input_dim) and (N, output_dim).
        """

    def _train_test_split(
        self,
        x: np.ndarray,
        y: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Keep only the leading ``config.split_ratio`` share of the samples."""
        cutoff = int(len(x) * self.config.split_ratio)
        train_x = x[:cutoff]
        train_y = y[:cutoff]
        return train_x, train_y

    def _normalize(
        self,
        x: np.ndarray,
        y: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Z-score each column of x and of y, using their own statistics."""
        scaled = []
        for arr in (x, y):
            mean = arr.mean(axis=0)
            std = arr.std(axis=0)
            # (Near-)constant columns would divide by ~0; leave them unscaled.
            std = np.where(std < 1e-8, 1.0, std)
            scaled.append((arr - mean) / std)
        return scaled[0], scaled[1]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def make_loader(config: DataConfig) -> DataSource:
    """
    Factory function: choose a DataSource implementation from config.format.

    The format string is matched case-insensitively.  Supported formats:
        "csv"                    — CSV file at config.path
        "json"                   — JSON / JSONL file at config.path
        "numpy" / "npy" / "npz"  — .npy or .npz file at config.path
        "builtin:<name>"         — synthetic dataset generated by
                                   BuiltinLoader ("builtin:sin",
                                   "builtin:xor", "builtin:spiral")

    Raises
    ------
    ValueError
        If config.format is empty or matches none of the above.
    """
    fmt = (config.format or "").lower()

    # Loader modules are imported lazily so that optional dependencies are
    # only required for the format actually requested.
    if fmt == "csv":
        from stanno.data.csv_loader import CSVLoader
        return CSVLoader(config)

    if fmt == "json":
        from stanno.data.json_loader import JSONLoader
        return JSONLoader(config)

    if fmt in ("numpy", "npy", "npz"):
        from stanno.data.numpy_loader import NumpyLoader
        return NumpyLoader(config)

    if fmt.startswith("builtin:"):
        from stanno.data.numpy_loader import BuiltinLoader
        return BuiltinLoader(config)

    raise ValueError(
        f"Unknown data format {config.format!r}. "
        "Supported: 'csv', 'json', 'numpy' (aliases 'npy', 'npz'), "
        "'builtin:sin', 'builtin:xor', 'builtin:spiral'."
    )
|
stanno/data/csv_loader.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CSV data loader.
|
| 3 |
+
|
| 4 |
+
Requires pandas. Install with: pip install stanno[data]
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
from stanno.config.schema import DataConfig
|
| 13 |
+
from stanno.data.base import DataSource
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CSVLoader(DataSource):
    """
    Load supervised learning data from a CSV file.

    Column selection
    ────────────────
    If DataConfig.input_cols and output_cols are specified (as lists of
    column names or 0-based integers), those columns are used directly.
    Integer selectors are treated as column positions unless every one of
    them is also an existing column label.

    Otherwise the convention is:
      • All but the last column → X  (inputs)
      • Last column             → Y  (targets)
    """

    def load(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Read config.path and return the training portion of (X, Y) as float32.

        Processing order: column selection, optional truncation to
        config.n_samples rows, optional z-score normalisation, then the
        train split inherited from DataSource.

        Raises
        ------
        ImportError
            If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "pandas is required for CSV loading. "
                "Install it with: pip install stanno[data]"
            ) from exc

        df = pd.read_csv(self.config.path)

        if self.config.input_cols and self.config.output_cols:
            x = self._select_columns(df, self.config.input_cols)
            y = self._select_columns(df, self.config.output_cols)
        else:
            x = df.iloc[:, :-1].to_numpy(dtype=np.float32)
            y = df.iloc[:, -1:].to_numpy(dtype=np.float32)

        if self.config.n_samples is not None and self.config.n_samples < len(x):
            x = x[: self.config.n_samples]
            y = y[: self.config.n_samples]

        if self.config.normalize:
            x, y = self._normalize(x, y)

        x, y = self._train_test_split(x, y)
        return x, y

    @staticmethod
    def _select_columns(df, cols) -> np.ndarray:
        """Return the requested columns of ``df`` as a float32 array.

        ``df[cols]`` is a *label* lookup, so a list of 0-based integers fails
        with KeyError on a normally-headered CSV (whose labels are strings).
        A list/tuple made only of integers is therefore resolved by position,
        except when all of them are genuine column labels, in which case the
        label lookup is kept.
        """
        positional = (
            isinstance(cols, (list, tuple))
            and len(cols) > 0
            and all(
                isinstance(c, (int, np.integer)) and not isinstance(c, bool)
                for c in cols
            )
            and not all(c in df.columns for c in cols)
        )
        selected = df.iloc[:, list(cols)] if positional else df[cols]
        return selected.to_numpy(dtype=np.float32)
|
stanno/data/json_loader.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
JSON / JSONL data loader.
|
| 3 |
+
|
| 4 |
+
Handles two formats:
|
| 5 |
+
• Regular JSON — a list of dicts or a dict with "x"/"y" keys
|
| 6 |
+
• JSONL — one JSON object per line (auto-detected)
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
import json
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Tuple
|
| 13 |
+
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
from stanno.config.schema import DataConfig
|
| 17 |
+
from stanno.data.base import DataSource
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class JSONLoader(DataSource):
    """
    Read (X, Y) training arrays from a JSON or JSONL file.

    Accepted structures
    ───────────────────
    1. Dict with "x" and "y" keys:
         {"x": [[...], ...], "y": [[...], ...]}

    2. List of dicts, read with input_cols / output_cols as keys (when
       those are unset: every key of the first record but the last → X,
       the last key → Y):
         [{"feature_a": 1.0, "feature_b": 2.0, "label": 0.0}, ...]

    3. List of lists: one sample per row, last column → Y.

    4. JSONL (one JSON value per line) — the parsed lines form the list.
    """

    def load(self) -> Tuple[np.ndarray, np.ndarray]:
        """Parse config.path, then truncate, normalise and split as configured."""
        raw = Path(self.config.path).read_text(encoding="utf-8")

        # More than one non-blank line: try JSONL first and fall back to
        # parsing the text as a whole (e.g. pretty-printed JSON) if any
        # single line is not valid JSON.
        rows = [ln for ln in raw.strip().splitlines() if ln.strip()]
        if len(rows) > 1:
            try:
                data = [json.loads(ln) for ln in rows]
            except json.JSONDecodeError:
                data = json.loads(raw)
        else:
            data = json.loads(raw)

        x, y = self._parse(data)

        limit = self.config.n_samples
        if limit is not None and limit < len(x):
            x, y = x[:limit], y[:limit]

        if self.config.normalize:
            x, y = self._normalize(x, y)

        return self._train_test_split(x, y)

    def _parse(self, data) -> Tuple[np.ndarray, np.ndarray]:
        """Turn decoded JSON into float32 (X, Y); ValueError if unrecognised."""
        # {"x": ..., "y": ...}: flat vectors become single-column matrices.
        if isinstance(data, dict) and "x" in data and "y" in data:
            pair = []
            for key in ("x", "y"):
                arr = np.array(data[key], dtype=np.float32)
                pair.append(arr.reshape(-1, 1) if arr.ndim == 1 else arr)
            return pair[0], pair[1]

        head = data[0] if isinstance(data, list) and len(data) > 0 else None

        # List of records (plain JSON list or JSONL lines).
        if isinstance(head, dict):
            in_keys, out_keys = self.config.input_cols, self.config.output_cols
            if not (in_keys and out_keys):
                # No explicit columns: the first record's key order decides.
                keys = list(head.keys())
                in_keys, out_keys = keys[:-1], [keys[-1]]
            x = np.array([[rec[k] for k in in_keys] for rec in data], dtype=np.float32)
            y = np.array([[rec[k] for k in out_keys] for rec in data], dtype=np.float32)
            return x, y

        # Raw matrix: list of rows.
        if isinstance(head, (list, tuple)):
            matrix = np.array(data, dtype=np.float32)
            return matrix[:, :-1], matrix[:, -1:]

        raise ValueError(
            "Unsupported JSON structure. Expected {'x': ..., 'y': ...}, "
            "list of dicts, or list of lists."
        )
|
stanno/data/numpy_loader.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
NumPy / built-in data loaders.
|
| 3 |
+
|
| 4 |
+
NumpyLoader handles .npy and .npz files.
|
| 5 |
+
BuiltinLoader generates synthetic datasets for quick tests.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
from typing import Tuple
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
from stanno.config.schema import DataConfig
|
| 14 |
+
from stanno.data.base import DataSource
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class NumpyLoader(DataSource):
    """
    Load data from a NumPy .npy or .npz file.

    .npy files
    ──────────
    The array is treated as an autoencoder target: X = Y = loaded array.
    Useful for unsupervised reconstruction or embedding normalisation tasks.

    .npz files
    ──────────
    Expected keys: "x" and "y" (case-insensitive).  Without that pair the
    first two arrays of the archive are used as X and Y; if only one array
    is present the loader falls back to autoencoder mode.
    """

    def load(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Read config.path and return the training portion of (X, Y) as float32.

        1-D arrays are reshaped to a single column; the result is then
        truncated to config.n_samples, optionally normalised, and split.
        """
        # str() lets config.path be either a string or a pathlib.Path.
        path = str(self.config.path)

        if path.lower().endswith(".npz"):
            # np.load returns an NpzFile that keeps the archive open; the
            # context manager closes it once the arrays have been copied out
            # (astype makes independent in-memory copies).
            with np.load(path) as archive:
                keys = list(archive.files)
                lower_keys = {k.lower(): k for k in keys}
                if "x" in lower_keys and "y" in lower_keys:
                    x = archive[lower_keys["x"]].astype(np.float32)
                    y = archive[lower_keys["y"]].astype(np.float32)
                elif len(keys) >= 2:
                    x = archive[keys[0]].astype(np.float32)
                    y = archive[keys[1]].astype(np.float32)
                else:
                    arr = archive[keys[0]].astype(np.float32)
                    x = y = arr     # autoencoder
        else:
            arr = np.load(path).astype(np.float32)
            x = y = arr             # autoencoder

        if x.ndim == 1:
            x = x.reshape(-1, 1)
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        if self.config.n_samples is not None and self.config.n_samples < len(x):
            x = x[: self.config.n_samples]
            y = y[: self.config.n_samples]

        if self.config.normalize:
            x, y = self._normalize(x, y)

        return self._train_test_split(x, y)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class BuiltinLoader(DataSource):
    """
    Generator of small synthetic datasets for demos and quick tests.

    Supported values for DataConfig.format:
        "builtin:sin"    — y = sin(2π·x),  x ∈ [0, 1]
        "builtin:xor"    — noisy 2D XOR classification
        "builtin:spiral" — 2-class spiral (n_samples points)

    config.n_samples sets the number of points (512 when unset).  Random
    noise comes from a generator seeded with 42, so the output is
    reproducible.
    """

    def load(self) -> Tuple[np.ndarray, np.ndarray]:
        """Build the dataset named after "builtin:", then normalise and split."""
        name = self.config.format.split(":", 1)[-1].lower()
        n = self.config.n_samples or 512
        rng = np.random.default_rng(42)

        if name == "sin":
            x, y = self._make_sin(n)
        elif name == "xor":
            x, y = self._make_xor(n, rng)
        elif name == "spiral":
            x, y = self._make_spiral(n, rng)
        else:
            raise ValueError(
                f"Unknown builtin dataset {name!r}. "
                "Supported: 'builtin:sin', 'builtin:xor', 'builtin:spiral'."
            )

        if self.config.normalize:
            x, y = self._normalize(x, y)

        return self._train_test_split(x, y)

    @staticmethod
    def _make_sin(n):
        """One period of a sine wave sampled at n evenly spaced points."""
        x = np.linspace(0.0, 1.0, n, dtype=np.float32).reshape(-1, 1)
        y = np.sin(2.0 * np.pi * x).astype(np.float32)
        return x, y

    @staticmethod
    def _make_xor(n, rng):
        """n unit-square corners drawn with replacement, jittered; label = XOR."""
        corners = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
        pts = rng.choice(corners, size=n, replace=True)
        labels = (pts[:, 0].astype(int) ^ pts[:, 1].astype(int)).astype(np.float32)
        x = pts + rng.normal(0.0, 0.05, pts.shape).astype(np.float32)
        return x, labels.reshape(-1, 1)

    @staticmethod
    def _make_spiral(n, rng):
        """Two interleaved spiral arms of n // 2 points each, with jitter."""
        n_half = n // 2
        theta = np.linspace(0.0, 4 * np.pi, n_half, dtype=np.float32)
        r = np.linspace(0.1, 1.0, n_half, dtype=np.float32)
        arm_a = np.stack([r * np.cos(theta), r * np.sin(theta)], axis=1)
        # The second arm is the first one rotated by half a turn.
        arm_b = np.stack([r * np.cos(theta + np.pi), r * np.sin(theta + np.pi)], axis=1)
        x = np.concatenate([arm_a, arm_b], axis=0).astype(np.float32)
        x += rng.normal(0.0, 0.05, x.shape).astype(np.float32)
        y = np.array([0.0] * n_half + [1.0] * n_half, dtype=np.float32).reshape(-1, 1)
        return x, y
|
stanno/integration/__init__.py
ADDED
|
File without changes
|
stanno/integration/cascade.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CascadeSTANNO — chain of STANNO objects forming a composite system model.
|
| 3 |
+
|
| 4 |
+
Implements the "cascading networks" concept from US5852815A (Thaler, 1998):
|
| 5 |
+
|
| 6 |
+
"Once multiple component-networks have been trained, they can be
|
| 7 |
+
cascaded — connected together — to form larger system models.
|
| 8 |
+
The outputs of one component-network feed into the inputs of another;
|
| 9 |
+
several components form a chain or a more complex topology representing
|
| 10 |
+
a device or system."
|
| 11 |
+
|
| 12 |
+
The patent also describes hybrid static/dynamic cascades:
|
| 13 |
+
|
| 14 |
+
"A component-network might be locked once it reaches satisfactory
|
| 15 |
+
performance. Another component (implemented as a STANNO) continues
|
| 16 |
+
to adapt to new operating conditions."
|
| 17 |
+
|
| 18 |
+
This is captured by the `frozen` flag per stage.
|
| 19 |
+
|
| 20 |
+
Training modes
|
| 21 |
+
──────────────
|
| 22 |
+
end-to-end (default)
|
| 23 |
+
Gradient flows backward from the last stage to the first using the
|
| 24 |
+
cascade-aware ``compute_cascade_updates()`` in FixedTrainerNet.
|
| 25 |
+
Frozen stages skip weight updates but still pass gradients through.
|
| 26 |
+
|
| 27 |
+
staged
|
| 28 |
+
Each stage is trained independently on provided intermediate targets
|
| 29 |
+
(useful when you have supervision at every stage, e.g. multi-task
|
| 30 |
+
pipelines).
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
from __future__ import annotations
|
| 34 |
+
|
| 35 |
+
import copy
|
| 36 |
+
import pickle
|
| 37 |
+
from typing import List, Optional, Tuple
|
| 38 |
+
|
| 39 |
+
import numpy as np
|
| 40 |
+
|
| 41 |
+
from stanno.core.stanno import STANNO
|
| 42 |
+
from stanno.config.schema import STANNOConfig
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class CascadeSTANNO:
|
| 46 |
+
"""
|
| 47 |
+
Ordered chain of STANNO objects.
|
| 48 |
+
|
| 49 |
+
Parameters
|
| 50 |
+
----------
|
| 51 |
+
stages : List[STANNO]
|
| 52 |
+
Pre-built STANNO objects in pipeline order. Output dim of stage k
|
| 53 |
+
must equal input dim of stage k+1.
|
| 54 |
+
frozen : List[bool], optional
|
| 55 |
+
Per-stage freeze flags. Frozen stages receive no weight updates.
|
| 56 |
+
Gradients still flow through them during end-to-end training.
|
| 57 |
+
|
| 58 |
+
Example
|
| 59 |
+
-------
|
| 60 |
+
>>> from stanno import STANNO, STANNOConfig
|
| 61 |
+
>>> from stanno.integration.cascade import CascadeSTANNO
|
| 62 |
+
>>> enc = STANNO(STANNOConfig(layers=[784, 128, 32]))
|
| 63 |
+
>>> dec = STANNO(STANNOConfig(layers=[32, 128, 784]))
|
| 64 |
+
>>> ae = CascadeSTANNO([enc, dec])
|
| 65 |
+
>>> ae.fit(x_train, x_train, epochs=200) # autoencoder
|
| 66 |
+
>>> codes = ae.intermediate_output(x_train, stage=0) # encoder output
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
def __init__(
    self,
    stages: Optional[List[STANNO]] = None,
    frozen: Optional[List[bool]] = None,
) -> None:
    """
    Build a cascade from pre-built stages.

    stages : stages in pipeline order; an empty cascade is created when
        omitted (stages can be appended later with add_stage()).
    frozen : one freeze flag per stage.  When omitted (or empty) every
        stage starts unfrozen.

    Raises
    ------
    ValueError
        If ``frozen`` is given but its length differs from the number of
        stages.
    """
    self.stages: List[STANNO] = list(stages) if stages else []
    self.frozen: List[bool] = list(frozen) if frozen else [False] * len(self.stages)
    # Fail here rather than with an IndexError deep inside training, where
    # the flags are looked up by stage index.
    if len(self.frozen) != len(self.stages):
        raise ValueError(
            f"Need one frozen flag per stage: got {len(self.frozen)} "
            f"flags for {len(self.stages)} stages."
        )
    self._loss_history: List[float] = []
|
| 77 |
+
|
| 78 |
+
# ── builder helpers ───────────────────────────────────────────────────────
|
| 79 |
+
|
| 80 |
+
def add_stage(self, stanno: STANNO, frozen: bool = False) -> "CascadeSTANNO":
    """Append ``stanno`` (with its freeze flag) to the end of the cascade.

    Mutates the cascade in place and returns it so calls can be chained.
    """
    for registry, entry in ((self.stages, stanno), (self.frozen, frozen)):
        registry.append(entry)
    return self
|
| 85 |
+
|
| 86 |
+
def freeze(self, stage: int) -> "CascadeSTANNO":
    """Set the freeze flag of stage ``stage`` and return the cascade.

    A frozen stage receives no weight updates.
    """
    flags = self.frozen
    flags[stage] = True
    return self
|
| 90 |
+
|
| 91 |
+
def unfreeze(self, stage: int) -> "CascadeSTANNO":
    """Clear the freeze flag of stage ``stage`` and return the cascade.

    The stage accepts weight updates again.
    """
    flags = self.frozen
    flags[stage] = False
    return self
|
| 95 |
+
|
| 96 |
+
# ── forward pass ─────────────────────────────────────────────────────────
|
| 97 |
+
|
| 98 |
+
def predict(self, x: np.ndarray) -> np.ndarray:
    """Run ``x`` (cast to float32) through every stage in order.

    Returns the output of the last stage, or the float32 input itself
    when the cascade has no stages.
    """
    signal = np.asarray(x, dtype=np.float32)
    for component in self.stages:
        signal = component.predict(signal)
    return signal
|
| 104 |
+
|
| 105 |
+
def intermediate_output(self, x: np.ndarray, stage: int) -> np.ndarray:
    """Return what stage ``stage`` produces when ``x`` enters the cascade.

    ``stage`` is 0-indexed; negative values count from the end, as with
    list indexing.  Handy for reading out the representation at any depth.

    Raises
    ------
    IndexError
        If ``stage`` does not refer to an existing stage.
    """
    count = len(self.stages)
    if stage < 0:
        stage += count
    if stage < 0 or stage >= count:
        raise IndexError(
            f"stage {stage} out of range for cascade with {len(self.stages)} stages"
        )
    signal = np.asarray(x, dtype=np.float32)
    # Run only the stages up to and including the requested one.
    for component in self.stages[: stage + 1]:
        signal = component.predict(signal)
    return signal
|
| 120 |
+
|
| 121 |
+
def all_intermediate_outputs(self, x: np.ndarray) -> List[np.ndarray]:
    """Collect the signal at every stage boundary.

    The returned list starts with the float32 input and then holds the
    output of each stage in order, i.e. ``len(stages) + 1`` arrays.
    """
    outputs = [np.asarray(x, dtype=np.float32)]
    for component in self.stages:
        # Each stage consumes the most recent boundary value.
        outputs.append(component.predict(outputs[-1]))
    return outputs
|
| 129 |
+
|
| 130 |
+
# ── training ──────────────────────────────────────────���──────────────────
|
| 131 |
+
|
| 132 |
+
def fit(
    self,
    x: np.ndarray,
    y: np.ndarray,
    epochs: int = 100,
    batch_size: int = 32,
    shuffle: bool = True,
    patience: int = 20,
    divergence_threshold: float = 100.0,
    log_every: int = 10,
    callback=None,
) -> List[float]:
    """
    End-to-end training of the whole cascade.

    Every mini-batch is handed to ``self._endtoend_step``; the mean of the
    returned batch losses is the epoch loss, which is appended to the
    cascade's loss history.

    Parameters
    ----------
    x, y : ndarray  (N, input_dim) and (N, output_dim)
    epochs : int
    batch_size : int
    shuffle : bool — reshuffle the samples at the start of every epoch
        (uses NumPy's global random state)
    patience : int — epochs without improvement before early stopping
        (0 disables early stopping)
    divergence_threshold : float — training halts as soon as the epoch
        loss exceeds this value or is not finite (NaN / inf)
    log_every : int — print loss every N epochs (0 = silent)
    callback : callable(epoch, loss), optional — called after every epoch
        that did not halt training

    Returns
    -------
    loss_history : List[float]
        Copy of the accumulated history, including epochs from earlier
        fit() calls on this cascade.
    """
    x = np.asarray(x, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32)
    N = x.shape[0]
    best_loss = float("inf")
    patience_counter = 0

    for epoch in range(epochs):
        idx = np.arange(N)
        if shuffle:
            np.random.shuffle(idx)
        x_s, y_s = x[idx], y[idx]

        epoch_losses: List[float] = [
            self._endtoend_step(
                x_s[start : start + batch_size],
                y_s[start : start + batch_size],
            )
            for start in range(0, N, batch_size)
        ]

        mean_loss = float(np.mean(epoch_losses))
        self._loss_history.append(mean_loss)

        # NaN compares False against any threshold, so a diverged run would
        # slip past a plain ``>`` test; check finiteness explicitly.
        if not np.isfinite(mean_loss):
            print(
                f"  ⚠ CASCADE DIVERGENCE at epoch {epoch}: "
                f"loss={mean_loss} is not finite. Halting."
            )
            break

        if mean_loss > divergence_threshold:
            print(
                f"  ⚠ CASCADE DIVERGENCE at epoch {epoch}: "
                f"loss={mean_loss:.6f} > {divergence_threshold}. Halting."
            )
            break

        if patience > 0:
            if mean_loss < best_loss:
                best_loss = mean_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(
                        f"  ⓘ Cascade early stopping at epoch {epoch}: "
                        f"no improvement for {patience} epochs. "
                        f"Best loss: {best_loss:.6f}"
                    )
                    break

        if log_every > 0 and epoch % log_every == 0:
            print(f"  [cascade] epoch {epoch:>5}  loss={mean_loss:.6f}")

        if callback is not None:
            callback(epoch, mean_loss)

    return list(self._loss_history)
|
| 216 |
+
|
| 217 |
+
def staged_fit(
    self,
    x: np.ndarray,
    intermediate_targets: List[np.ndarray],
    epochs: int = 100,
    batch_size: int = 32,
    patience: int = 20,
    log_every: int = 10,
) -> List[List[float]]:
    """
    Train every non-frozen stage on its own target, one stage after another.

    Stage ``k`` is fitted against ``intermediate_targets[k]`` using the
    output of stage ``k - 1`` (or ``x`` for the first stage) as its input.
    Frozen stages are not fitted; they are only run forward so the next
    stage receives its input.

    Parameters
    ----------
    x : array-like, cast to float32 — input of the first stage
    intermediate_targets : one target per stage; the last one is the
        final output target
    epochs, batch_size, patience : forwarded unchanged to ``stage.fit``
    log_every : int — print the loss every N epochs (0 = silent)

    Returns
    -------
    histories : List[List[float]] — one loss history per stage
        (an empty list for frozen stages)

    Raises
    ------
    ValueError
        If the number of targets differs from the number of stages.
    """
    n_targets = len(intermediate_targets)
    n_stages = len(self.stages)
    if n_targets != n_stages:
        raise ValueError(
            f"Need one target per stage: got {len(intermediate_targets)} "
            f"targets for {len(self.stages)} stages."
        )

    per_stage_history: List[List[float]] = []
    activations = np.asarray(x, dtype=np.float32)

    for k, stage in enumerate(self.stages):
        if not self.frozen[k]:
            stage_target = np.asarray(intermediate_targets[k], dtype=np.float32)

            # k / log_every are bound as defaults so each callback keeps the
            # values of the iteration in which it was created.
            def _report(epoch, loss, k=k, log_every=log_every):
                if log_every <= 0 or epoch % log_every != 0:
                    return
                print(f" [cascade stage {k}] epoch {epoch:>5} loss={loss:.6f}")

            stage.fit(
                activations,
                stage_target,
                epochs=epochs,
                batch_size=batch_size,
                patience=patience,
                callback=_report,
            )
            per_stage_history.append(list(stage._loss_history))
        else:
            per_stage_history.append([])

        # Frozen or freshly trained, the stage output feeds the next stage.
        activations = stage.predict(activations)

    return per_stage_history
|
| 270 |
+
|
| 271 |
+
# ── internal ──────────────────────────────────────────────────────────────
|
| 272 |
+
|
| 273 |
+
def _endtoend_step(self, x_batch: np.ndarray, y_batch: np.ndarray) -> float:
    """
    Single mini-batch end-to-end update pass.

    Runs ``x_batch`` forward through every stage, then walks the stages
    from last to first asking each stage's trainer for its weight/bias
    updates and for the gradient to hand to the preceding stage. Updates
    are applied only after all of them have been computed, and only to
    stages that are not frozen.

    Parameters
    ----------
    x_batch : np.ndarray — input of the first stage
    y_batch : np.ndarray — target for the output of the last stage

    Returns
    -------
    float — mean squared error between the last stage's forward output
    and ``y_batch``, using the prediction computed before the updates
    of this step were applied.
    """
    # ── Forward: collect states ────────────────────────────────────────
    stage_states: List[Tuple] = []  # (h_in, y_pred, state)
    h = x_batch
    for stage in self.stages:
        y_pred, state = stage.net.forward(h)
        # Attach what the trainer reads later to the state object.
        # NOTE(review): the exact fields the trainer consumes are defined
        # outside this block — confirm against the trainer implementation.
        state.y_pred = y_pred
        state.x_batch = h
        state.loss_history = stage._loss_history[-100:]  # last 100 entries only
        stage_states.append((h, y_pred, state))
        h = y_pred  # output of this stage is the input of the next

    # ── Backward: gradient cascade ────────────────────────────────────
    output_delta: Optional[np.ndarray] = None  # None → last stage uses y_batch
    pending_updates: List[Optional[Tuple]] = [None] * len(self.stages)

    # Iterate from the last stage down to stage 0.
    for k in range(len(self.stages) - 1, -1, -1):
        h_in, y_pred, state = stage_states[k]
        trainer = self.stages[k].trainer

        if k == len(self.stages) - 1:
            # Last stage: set real target so trainer can compute delta
            state.y_batch = y_batch
        else:
            # Earlier stage: inject a virtual target that makes the
            # trainer reproduce the incoming output_delta.
            # (2/N)(y_pred − y_virtual) = output_delta
            # → y_virtual = y_pred − (N/2) * output_delta
            # Here N is the number of rows in this stage's prediction.
            batch_size = y_pred.shape[0]
            if output_delta is not None:
                state.y_batch = y_pred - (batch_size / 2.0) * output_delta
            else:
                state.y_batch = y_pred  # zero error (should not happen)

        # Updates are computed for frozen stages too: their input gradient
        # is still needed by the stages before them.
        dW, db, input_grad = trainer.compute_cascade_updates(
            state, output_delta=output_delta
        )
        pending_updates[k] = (dW, db)
        output_delta = input_grad  # flow backward to previous stage

    # ── Apply updates to non-frozen stages ────────────────────────────
    # Deferred until here so every gradient above was computed with the
    # weights used in the forward pass.
    for k, upd in enumerate(pending_updates):
        if upd is not None and not self.frozen[k]:
            dW, db = upd
            self.stages[k].net.apply_updates(dW, db)

    # Final output loss for monitoring
    final_pred = stage_states[-1][1]
    return float(np.mean((final_pred - y_batch) ** 2))
|
| 323 |
+
|
| 324 |
+
# ── persistence ──────────────────────────────────────────────────────────
|
| 325 |
+
|
| 326 |
+
def save(self, path: str) -> None:
    """Serialise this whole object to ``path`` with :mod:`pickle`.

    The file is opened in binary write mode, so an existing file at
    ``path`` is overwritten.
    """
    with open(path, "wb") as handle:
        pickle.dump(self, handle)
|
| 330 |
+
|
| 331 |
+
@classmethod
def load(cls, path: str) -> "CascadeSTANNO":
    """Restore an object previously written by ``save``.

    Parameters
    ----------
    path : str — file to read (opened in binary mode)

    Returns
    -------
    The unpickled object, guaranteed to be an instance of ``cls``.

    Raises
    ------
    TypeError
        If the file unpickles to something that is not an instance of ``cls``.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load files that come from a trusted source.
    with open(path, "rb") as handle:
        restored = pickle.load(handle)
    if isinstance(restored, cls):
        return restored
    raise TypeError(f"Expected CascadeSTANNO, got {type(restored)}")
|
| 339 |
+
|
| 340 |
+
# ── introspection ─────────────────────────────────────────────────────────
|
| 341 |
+
|
| 342 |
+
def __len__(self) -> int:
    """Return how many stages ``self.stages`` holds."""
    stages = self.stages
    return len(stages)
|
| 344 |
+
|
| 345 |
+
def __getitem__(self, idx: int) -> STANNO:
    """Return the entry of ``self.stages`` at position ``idx``."""
    stages = self.stages
    return stages[idx]
|
| 347 |
+
|
| 348 |
+
def __repr__(self) -> str:
    """Return a multi-line summary with one line per stage.

    Each line shows the stage index, the stage's ``config.layers`` joined
    by arrows, and a ``[frozen]`` marker when the matching ``self.frozen``
    flag is truthy.
    """

    def _describe(position, stage, is_frozen):
        widths = "→".join(str(d) for d in stage.config.layers)
        marker = " [frozen]" if is_frozen else ""
        return f" {position}: [{widths}]{marker}"

    lines = [
        _describe(position, stage, is_frozen)
        for position, (stage, is_frozen) in enumerate(zip(self.stages, self.frozen))
    ]
    return "CascadeSTANNO(\n" + "\n".join(lines) + "\n)"
|
stanno/integration/continual.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ContinualSTANNO — online / continual learning wrapper.
|
| 3 |
+
|
| 4 |
+
Provides a one-sample-at-a-time observe() API and a held-out test_loss()
|
| 5 |
+
evaluator for monitoring concept drift, both powered by the same
|
| 6 |
+
STANNO._trainer_step() machinery.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
from typing import List, Optional, Tuple
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
from stanno.core.stanno import STANNO
|
| 15 |
+
from stanno.config.schema import STANNOConfig
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ContinualSTANNO:
    """
    Thin wrapper enabling online / continual learning on a STANNO.

    Keeps a count of online updates and a sliding window of the most recent
    losses so drift can be monitored while the wrapped model keeps learning.

    Parameters
    ----------
    stanno : STANNO
        A (possibly pre-trained) STANNO instance. Only its
        ``_trainer_step(x, y)`` and ``predict(x)`` methods are used here.
    """

    def __init__(self, stanno: STANNO) -> None:
        self.stanno = stanno
        self._step_count: int = 0
        self._recent_losses: List[float] = []

    # ── online update ─────────────────────────────────────────────────────────

    def observe(
        self,
        x: np.ndarray,
        y_true: np.ndarray,
    ) -> float:
        """
        One online update step.

        Parameters
        ----------
        x : array (1, input_dim) or (input_dim,)
        y_true : array (1, output_dim) or (output_dim,)

        Returns
        -------
        loss : float — the value returned by ``STANNO._trainer_step`` for
            this sample.
        """
        # Both inputs are forced into a single-row float32 batch.
        x = np.asarray(x, dtype=np.float32).reshape(1, -1)
        y = np.asarray(y_true, dtype=np.float32).reshape(1, -1)
        loss = self.stanno._trainer_step(x, y)
        self._step_count += 1
        self._recent_losses.append(loss)
        # Keep only the newest 1000 losses.
        if len(self._recent_losses) > 1000:
            self._recent_losses.pop(0)
        return loss

    # ── evaluation ────────────────────────────────────────────────────────────

    def test_loss(
        self,
        x_test: np.ndarray,
        y_test: np.ndarray,
        batch_size: int = 256,
    ) -> float:
        """
        Compute the MSE on held-out data using ``predict`` only.

        The squared errors of all batches are pooled before averaging, so
        the result is the mean over every element and does not depend on
        ``batch_size`` (a short final batch is not over-weighted).

        Parameters
        ----------
        x_test, y_test : arrays of shape (N, *)
        batch_size : int — rows evaluated per ``predict`` call; avoids OOM
            for large arrays.

        Returns
        -------
        mean MSE : float — NaN when ``x_test`` is empty.
        """
        x_test = np.asarray(x_test, dtype=np.float32)
        y_test = np.asarray(y_test, dtype=np.float32)

        sq_err_sum = 0.0
        n_elements = 0
        for start in range(0, len(x_test), batch_size):
            xb = x_test[start: start + batch_size]
            yb = y_test[start: start + batch_size]
            sq_err = (self.stanno.predict(xb) - yb) ** 2
            # Accumulate in float64 so many float32 batches do not lose precision.
            sq_err_sum += float(np.sum(sq_err, dtype=np.float64))
            n_elements += int(np.size(sq_err))

        if n_elements == 0:
            return float("nan")
        return sq_err_sum / n_elements

    # ── convenience ───────────────────────────────────────────────────────────

    @property
    def steps(self) -> int:
        """Number of observe() calls so far."""
        return self._step_count

    @property
    def recent_loss(self) -> float:
        """Mean loss over the last ≤1000 steps (NaN before the first step)."""
        if not self._recent_losses:
            return float("nan")
        return float(np.mean(self._recent_losses))

    def __repr__(self) -> str:
        return (
            f"ContinualSTANNO(steps={self._step_count}, "
            f"recent_loss={self.recent_loss:.5f})"
        )
|
stanno/integration/dsanno.py
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DSANNO — Data Scanning Artificial Neural Network Object.
|
| 3 |
+
|
| 4 |
+
Implements the DSANNO concept from US5852815A (Thaler, 1998):
|
| 5 |
+
|
| 6 |
+
"A DSANNO scans large regions of the data space, looking for patterns
|
| 7 |
+
or relationships that match its learned internal representations."
|
| 8 |
+
|
| 9 |
+
Modern reinterpretation
|
| 10 |
+
───────────────────────
|
| 11 |
+
In the patent, "scanning the data space" meant traversing spreadsheet cell
|
| 12 |
+
ranges. In a modern context the data space is a NumPy array, a pandas
|
| 13 |
+
DataFrame, or any stream of observations.
|
| 14 |
+
|
| 15 |
+
DSANNO wraps a trained STANNO and turns it into a *semantic scanner*:
|
| 16 |
+
|
| 17 |
+
• Low reconstruction error → the STANNO recognises this row (good match)
|
| 18 |
+
• High reconstruction error → the row is outside the STANNO's domain
|
| 19 |
+
|
| 20 |
+
Two canonical use-cases:
|
| 21 |
+
|
| 22 |
+
1. **Reconstruction / autoencoder mode**
|
| 23 |
+
The STANNO is trained to reconstruct its own input (layers[-1] == layers[0]).
|
| 24 |
+
DSANNO scores rows by ``||x - STANNO(x)||²``.
|
| 25 |
+
|
| 26 |
+
2. **Regression / classifier mode**
|
| 27 |
+
The STANNO is trained on (x, y_known). DSANNO scores rows by
|
| 28 |
+
``||y_known - STANNO(x)||²``. Rows with low score are those the
|
| 29 |
+
model "knows about".
|
| 30 |
+
|
| 31 |
+
Applications
|
| 32 |
+
────────────
|
| 33 |
+
- Dataset curation: find which rows of a large table match the model's domain.
|
| 34 |
+
- Anomaly stream scanning: flag rows that are outside the STANNO's distribution.
|
| 35 |
+
- Semantic retrieval: return the top-k rows most similar to what the model learned.
|
| 36 |
+
- Cascaded filtering: chain DSANNOs to progressively narrow a large dataset.
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
from __future__ import annotations
|
| 40 |
+
|
| 41 |
+
from dataclasses import dataclass, field
|
| 42 |
+
from typing import Generator, Iterable, List, Optional, Tuple
|
| 43 |
+
|
| 44 |
+
import numpy as np
|
| 45 |
+
|
| 46 |
+
from stanno.core.stanno import STANNO
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ─── result type ─────────────────────────────────────────────────────────────
|
| 50 |
+
|
| 51 |
+
@dataclass
class ScanResult:
    """
    Outcome of scoring a set of rows with a DSANNO.

    Attributes
    ----------
    indices : ndarray (N,)
        Original row indices (0-based) of the scanned data.
    scores : ndarray (N,)
        Reconstruction / prediction error per row (lower = better match).
    predictions : ndarray (N, output_dim)
        Raw STANNO output for each row.
    threshold : float or None
        Cut-off last applied through ``set_threshold`` (None if never set).
    matched_mask : ndarray (N,) bool or None
        True for rows where score <= threshold (None if no threshold set).
    """

    indices: np.ndarray
    scores: np.ndarray
    predictions: np.ndarray
    threshold: Optional[float] = None
    matched_mask: Optional[np.ndarray] = None

    # ── convenience accessors ────────────────────────────────────────────────

    def matched_indices(self) -> np.ndarray:
        """Return the indices selected by ``matched_mask``.

        Raises
        ------
        RuntimeError
            If no threshold has been applied yet.
        """
        mask = self.matched_mask
        if mask is not None:
            return self.indices[mask]
        raise RuntimeError("No threshold set — call scan(threshold=…) or set_threshold()")

    def top_k_indices(self, k: int) -> np.ndarray:
        """Return the indices of the ``k`` rows with the lowest scores."""
        ascending = np.argsort(self.scores)
        best = ascending[:k]
        return self.indices[best]

    def top_k(self, k: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(indices, scores)`` of the ``k`` lowest-score rows, best first."""
        best = np.argsort(self.scores)[:k]
        picked_indices = self.indices[best]
        picked_scores = self.scores[best]
        return picked_indices, picked_scores

    def set_threshold(self, threshold: float) -> "ScanResult":
        """Store ``threshold``, recompute ``matched_mask`` and return self."""
        self.threshold = threshold
        # Inclusive comparison: a score equal to the threshold is a match.
        self.matched_mask = self.scores <= threshold
        return self

    def __len__(self) -> int:
        """Number of scanned rows (length of ``indices``)."""
        return len(self.indices)

    def __repr__(self) -> str:
        """Compact summary: row count, score range and match count if available."""
        count = len(self.indices)
        if count:
            lowest = f"{self.scores.min():.4f}"
            highest = f"{self.scores.max():.4f}"
        else:
            # No rows: min()/max() would fail, show placeholders instead.
            lowest = highest = "—"
        if self.matched_mask is None:
            match_part = ""
        else:
            match_part = f" matched={self.matched_mask.sum()}"
        return f"ScanResult(n={count}, score_range=[{lowest}, {highest}]{match_part})"
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# ─── DSANNO ──────────────────────────────────────────────────────────────────
|
| 115 |
+
|
| 116 |
+
class DSANNO:
    """
    Data Scanning Artificial Neural Network Object.

    Wraps a trained STANNO and scans datasets to find rows that match its
    learned representation.

    Parameters
    ----------
    stanno : STANNO
        A trained STANNO instance. Only ``predict(x)`` and, for ``repr``,
        ``config.layers`` are used by this class.
    mode : {"reconstruction", "prediction"}
        Scoring mode:
        - ``"reconstruction"`` — compare STANNO(x) to x itself.
          Requires output_dim == input_dim.
        - ``"prediction"`` — compare STANNO(x) to an externally provided y.
          Pass y_ref to scan() / score_batch().
    score_fn : callable(y_pred, y_ref) → ndarray (N,), optional
        Custom scoring function returning one score per row.
        Default: per-row mean squared error.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the two supported values.

    Example
    -------
    >>> # Reconstruction scanner (autoencoder mode)
    >>> ae = STANNO(STANNOConfig(layers=[16, 8, 16]))
    >>> ae.fit(x_normal, x_normal)
    >>> scanner = DSANNO(ae, mode="reconstruction")
    >>> result = scanner.scan(x_large_dataset, threshold=0.05)
    >>> normal_rows = result.matched_indices()
    """

    def __init__(
        self,
        stanno: STANNO,
        mode: str = "reconstruction",
        score_fn=None,
    ) -> None:
        if mode not in ("reconstruction", "prediction"):
            raise ValueError(
                f"mode must be 'reconstruction' or 'prediction', got {mode!r}"
            )
        self.stanno = stanno
        self.mode = mode
        self._score_fn = score_fn or self._default_score

    # ── core scoring ─────────────────────────────────────────────────────────

    @staticmethod
    def _default_score(y_pred: np.ndarray, y_ref: np.ndarray) -> np.ndarray:
        """Mean squared error per row — shape (N,)."""
        return np.mean((y_pred - y_ref) ** 2, axis=1)

    def score_row(
        self,
        x: np.ndarray,
        y_ref: Optional[np.ndarray] = None,
    ) -> float:
        """
        Score a single row.

        Parameters
        ----------
        x : array (input_dim,) or (1, input_dim)
        y_ref : array (output_dim,) or (1, output_dim), optional
            Required in 'prediction' mode.

        Returns
        -------
        score : float — lower means the STANNO recognises this row.

        Raises
        ------
        ValueError
            In 'prediction' mode when ``y_ref`` is missing.
        """
        x = np.asarray(x, dtype=np.float32).reshape(1, -1)
        y_pred = self.stanno.predict(x)
        if self.mode == "reconstruction":
            y_ref_arr = x
        else:
            if y_ref is None:
                raise ValueError("y_ref required in 'prediction' mode")
            y_ref_arr = np.asarray(y_ref, dtype=np.float32).reshape(1, -1)
        return float(self._score_fn(y_pred, y_ref_arr)[0])

    def score_batch(
        self,
        x: np.ndarray,
        y_ref: Optional[np.ndarray] = None,
        batch_size: int = 256,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Score an array of rows in batches.

        Parameters
        ----------
        x : ndarray (N, input_dim)
        y_ref : ndarray (N, output_dim), optional — required for 'prediction' mode
        batch_size : int — rows passed to ``predict`` per call

        Returns
        -------
        scores : ndarray (N,) — per-row reconstruction / prediction error
        predictions : ndarray — the STANNO outputs for every row, in order

        Raises
        ------
        ValueError
            In 'prediction' mode when ``y_ref`` is missing.
        """
        x = np.asarray(x, dtype=np.float32)
        N = x.shape[0]

        # Resolve the reference once, before any prediction work is done.
        if self.mode == "reconstruction":
            refs = x
        else:
            if y_ref is None:
                raise ValueError("y_ref required in 'prediction' mode")
            refs = np.asarray(y_ref, dtype=np.float32)

        # For an empty input, still push one (empty) batch through the model
        # so the outputs are well-formed empty arrays instead of
        # np.concatenate failing on an empty list.
        starts = range(0, N, batch_size) if N > 0 else range(1)

        all_scores: List[np.ndarray] = []
        all_preds: List[np.ndarray] = []
        for start in starts:
            stop = start + batch_size
            y_pred = self.stanno.predict(x[start:stop])
            all_scores.append(self._score_fn(y_pred, refs[start:stop]))
            all_preds.append(y_pred)

        return np.concatenate(all_scores), np.concatenate(all_preds)

    # ── scan ─────────────────────────────────────────────────────────────────

    def scan(
        self,
        x: np.ndarray,
        y_ref: Optional[np.ndarray] = None,
        threshold: Optional[float] = None,
        batch_size: int = 256,
    ) -> ScanResult:
        """
        Scan a dataset and return a ScanResult.

        Parameters
        ----------
        x : ndarray (N, input_dim)
        y_ref : ndarray (N, output_dim), optional — required for 'prediction' mode
        threshold : float, optional — rows with score <= threshold are flagged
        batch_size : int

        Returns
        -------
        ScanResult — indices are ``0 .. N-1``; ``matched_mask`` is populated
        only when ``threshold`` is given.
        """
        scores, preds = self.score_batch(x, y_ref=y_ref, batch_size=batch_size)
        indices = np.arange(len(scores))
        result = ScanResult(
            indices=indices,
            scores=scores,
            predictions=preds,
        )
        if threshold is not None:
            result.set_threshold(threshold)
        return result

    def top_k(
        self,
        x: np.ndarray,
        k: int = 10,
        y_ref: Optional[np.ndarray] = None,
        batch_size: int = 256,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Return the k rows that best match the STANNO's learned representation.

        Returns
        -------
        indices : ndarray (k,) — original row indices
        scores : ndarray (k,) — lowest error first
        preds : ndarray (k, output_dim)
        """
        scores, preds = self.score_batch(x, y_ref=y_ref, batch_size=batch_size)
        order = np.argsort(scores)[:k]
        return order, scores[order], preds[order]

    def find_matches(
        self,
        x: np.ndarray,
        threshold: float,
        y_ref: Optional[np.ndarray] = None,
        batch_size: int = 256,
    ) -> np.ndarray:
        """
        Return a boolean mask of rows whose score is at or below threshold.

        Parameters
        ----------
        x : ndarray (N, input_dim)
        threshold : float — score cutoff (inclusive)
        y_ref : ndarray (N, output_dim), optional
        batch_size : int

        Returns
        -------
        mask : ndarray (N,) bool
        """
        scores, _ = self.score_batch(x, y_ref=y_ref, batch_size=batch_size)
        return scores <= threshold

    # ── streaming scan ───────────────────────────────────────────────────────

    def scan_stream(
        self,
        batches: Iterable[np.ndarray],
        threshold: Optional[float] = None,
        y_ref_batches: Optional[Iterable[np.ndarray]] = None,
    ) -> Generator[ScanResult, None, None]:
        """
        Scan an iterable of batches lazily — suitable for large files or live feeds.

        Parameters
        ----------
        batches : iterable of ndarray (batch, input_dim)
        threshold : float, optional
        y_ref_batches : iterable of ndarray (batch, output_dim), optional
            Consumed in lockstep with ``batches``; required in 'prediction' mode.

        Yields
        ------
        ScanResult for each incoming batch (with offset indices)

        Raises
        ------
        ValueError
            In 'prediction' mode when ``y_ref_batches`` is missing or yields
            fewer items than ``batches``.
        """
        offset = 0
        ref_iter = iter(y_ref_batches) if y_ref_batches is not None else None

        for xb in batches:
            xb = np.asarray(xb, dtype=np.float32)
            # next() with a default: a bare next() raising StopIteration
            # inside a generator is converted to RuntimeError (PEP 479).
            y_ref_b = next(ref_iter, None) if ref_iter is not None else None
            y_pred = self.stanno.predict(xb)

            if self.mode == "reconstruction":
                ref = xb
            else:
                if y_ref_b is None:
                    if ref_iter is None:
                        raise ValueError("y_ref_batches required in 'prediction' mode")
                    raise ValueError(
                        "y_ref_batches yielded fewer batches than `batches`"
                    )
                ref = np.asarray(y_ref_b, dtype=np.float32)

            scores = self._score_fn(y_pred, ref)
            # Indices continue from the rows already seen in earlier batches.
            indices = np.arange(offset, offset + len(xb))
            result = ScanResult(indices=indices, scores=scores, predictions=y_pred)
            if threshold is not None:
                result.set_threshold(threshold)
            yield result
            offset += len(xb)

    # ── calibration ──────────────────────────────────────────────────────────

    def calibrate_threshold(
        self,
        x_known: np.ndarray,
        percentile: float = 95.0,
        y_ref: Optional[np.ndarray] = None,
        batch_size: int = 256,
    ) -> float:
        """
        Estimate a threshold from known in-distribution data.

        Scores ``x_known`` and returns the value at ``percentile``-th percentile.
        Use the result as the ``threshold`` in subsequent scan() calls.

        Parameters
        ----------
        x_known : ndarray (N, input_dim) — in-distribution examples
        percentile : float — e.g. 95 means "flag the 5% worst reconstructions"
        y_ref : ndarray (N, output_dim), optional — required for 'prediction' mode
        batch_size : int

        Returns
        -------
        threshold : float
        """
        scores, _ = self.score_batch(x_known, y_ref=y_ref, batch_size=batch_size)
        return float(np.percentile(scores, percentile))

    # ── repr ─────────────────────────────────────────────────────────────────

    def __repr__(self) -> str:
        layers = "→".join(str(d) for d in self.stanno.config.layers)
        return f"DSANNO(mode={self.mode!r}, stanno=[{layers}])"
|
stanno/integration/filter.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
STANNOFilter — anomaly-based pre-classifier / gating layer for LLM pipelines.
|
| 3 |
+
|
| 4 |
+
How it works
|
| 5 |
+
────────────
|
| 6 |
+
1. Encode the incoming prompt/context as a numeric vector x (caller's job).
|
| 7 |
+
2. Run STANNO.predict(x) and compare to the expected reconstruction y_true.
|
| 8 |
+
3. The MSE of this comparison is the "anomaly score".
|
| 9 |
+
4. Normalise to [0, 1] using the training MSE baseline.
|
| 10 |
+
5. If score > threshold AND block_above_threshold is True, raise an exception
|
| 11 |
+
(or return a blocked sentinel) instead of forwarding to the LLM.
|
| 12 |
+
|
| 13 |
+
For a typical autoencoder use-case (embedding reconstruction), a high MSE
|
| 14 |
+
means the input is out-of-distribution — the STANNO never learned it.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 19 |
+
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
from stanno.config.schema import FilterConfig
|
| 23 |
+
from stanno.core.stanno import STANNO
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class FilteredRequestError(Exception):
    """Signals that the STANNO anomaly filter blocked a request."""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class STANNOFilter:
|
| 31 |
+
"""
|
| 32 |
+
Anomaly-based pre-filter / pre-classifier for LLM pipelines.
|
| 33 |
+
|
| 34 |
+
Parameters
|
| 35 |
+
----------
|
| 36 |
+
stanno : STANNO
|
| 37 |
+
A trained STANNO instance used for anomaly scoring.
|
| 38 |
+
filter_config : FilterConfig
|
| 39 |
+
Threshold and blocking configuration.
|
| 40 |
+
llm_client : LLMClient, optional
|
| 41 |
+
If provided, filter_and_send() will call it after the check.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
def __init__(
    self,
    stanno: STANNO,
    filter_config: FilterConfig,
    llm_client=None,
) -> None:
    """Store the scoring model, the filter settings and the optional LLM client.

    Parameters
    ----------
    stanno : STANNO — kept as ``self.stanno``
    filter_config : FilterConfig — kept as ``self.config``
    llm_client : optional — kept as ``self.llm_client`` (None by default)
    """
    self.llm_client = llm_client
    self.config = filter_config
    self.stanno = stanno
|
| 53 |
+
|
| 54 |
+
# ── scoring ───────────────────────────────────────────────────────────────
|
| 55 |
+
|
| 56 |
+
def score(
|
| 57 |
+
self,
|
| 58 |
+
x: np.ndarray,
|
| 59 |
+
y_true: Optional[np.ndarray] = None,
|
| 60 |
+
) -> Tuple[float, Dict[str, Any]]:
|
| 61 |
+
"""
|
| 62 |
+
Compute an anomaly score for input x.
|
| 63 |
+
|
| 64 |
+
If y_true is None the STANNO output is compared against x itself
|
| 65 |
+
(autoencoder mode: expects input_dim == output_dim, or the caller
|
| 66 |
+
passes the expected reconstruction explicitly).
|
| 67 |
+
|
| 68 |
+
Returns
|
| 69 |
+
-------
|
| 70 |
+
score : float in [0.0, 1.0]
|
| 71 |
+
metadata : dict with raw_mse, norm_baseline, threshold, blocked
|
| 72 |
+
"""
|
| 73 |
+
x = np.asarray(x, dtype=np.float32)
|
| 74 |
+
if x.ndim == 1:
|
| 75 |
+
x = x.reshape(1, -1)
|
| 76 |
+
|
| 77 |
+
y_pred = self.stanno.predict(x)
|
| 78 |
+
|
| 79 |
+
if y_true is None:
|
| 80 |
+
# autoencoder: reconstruct x
|
| 81 |
+
if self.stanno.net.output_dim == self.stanno.net.input_dim:
|
| 82 |
+
y_true = x
|
| 83 |
+
else:
|
| 84 |
+
raise ValueError(
|
| 85 |
+
"y_true must be provided when output_dim != input_dim."
|
| 86 |
+
)
|
| 87 |
+
else:
|
| 88 |
+
y_true = np.asarray(y_true, dtype=np.float32)
|
| 89 |
+
if y_true.ndim == 1:
|
| 90 |
+
y_true = y_true.reshape(1, -1)
|
| 91 |
+
|
| 92 |
+
raw_mse = float(np.mean((y_pred - y_true) ** 2))
|
| 93 |
+
|
| 94 |
+
# Normalise: use training MSE baseline if available
|
| 95 |
+
baseline = self.stanno._train_mse_norm or 1.0
|
| 96 |
+
score = float(min(1.0, raw_mse / max(baseline, 1e-8)))
|
| 97 |
+
|
| 98 |
+
metadata = {
|
| 99 |
+
"raw_mse": raw_mse,
|
| 100 |
+
"norm_baseline": baseline,
|
| 101 |
+
"threshold": self.config.anomaly_threshold,
|
| 102 |
+
"score": score,
|
| 103 |
+
"blocked": score > self.config.anomaly_threshold
|
| 104 |
+
and self.config.block_above_threshold,
|
| 105 |
+
}
|
| 106 |
+
return score, metadata
|
| 107 |
+
|
| 108 |
+
# ── filter + forward ──────────────────────────────────────────────────────
|
| 109 |
+
|
| 110 |
+
def filter_and_send(
|
| 111 |
+
self,
|
| 112 |
+
messages: List[Dict[str, str]],
|
| 113 |
+
x: np.ndarray,
|
| 114 |
+
y_true: Optional[np.ndarray] = None,
|
| 115 |
+
**llm_kwargs: Any,
|
| 116 |
+
) -> Dict[str, Any]:
|
| 117 |
+
"""
|
| 118 |
+
Score x, optionally block the request, then send to the LLM.
|
| 119 |
+
|
| 120 |
+
Parameters
|
| 121 |
+
----------
|
| 122 |
+
messages : list of {"role": ..., "content": ...}
|
| 123 |
+
x : numeric representation of the input (embedding, features, etc.)
|
| 124 |
+
y_true : expected reconstruction, or None for autoencoder mode
|
| 125 |
+
**llm_kwargs : forwarded to llm_client.chat_completion()
|
| 126 |
+
|
| 127 |
+
Returns
|
| 128 |
+
-------
|
| 129 |
+
LLM response dict, enriched with "stanno_filter" metadata key.
|
| 130 |
+
|
| 131 |
+
Raises
|
| 132 |
+
------
|
| 133 |
+
FilteredRequestError
|
| 134 |
+
When score > threshold and block_above_threshold is True.
|
| 135 |
+
"""
|
| 136 |
+
score, metadata = self.score(x, y_true)
|
| 137 |
+
|
| 138 |
+
if metadata["blocked"]:
|
| 139 |
+
raise FilteredRequestError(
|
| 140 |
+
f"Request blocked by STANNO filter: "
|
| 141 |
+
f"anomaly_score={score:.4f} > threshold={self.config.anomaly_threshold:.4f}"
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
if self.llm_client is None:
|
| 145 |
+
raise RuntimeError(
|
| 146 |
+
"No llm_client configured on this STANNOFilter. "
|
| 147 |
+
"Pass one at construction time."
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
response = self.llm_client.chat_completion(messages, **llm_kwargs)
|
| 151 |
+
response[self.config.metadata_field] = metadata
|
| 152 |
+
return response
|
| 153 |
+
|
| 154 |
+
def __repr__(self) -> str:
|
| 155 |
+
return (
|
| 156 |
+
f"STANNOFilter(threshold={self.config.anomaly_threshold}, "
|
| 157 |
+
f"block={self.config.block_above_threshold})"
|
| 158 |
+
)
|
stanno/integration/llm_client.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM HTTP client.
|
| 3 |
+
|
| 4 |
+
Thin wrapper around httpx that talks to any OpenAI-compatible endpoint.
|
| 5 |
+
Credentials are read from environment variables (priority) or LLMConfig fields:
|
| 6 |
+
STANNO_LLM_BASE_URL — e.g. http://localhost:11434
|
| 7 |
+
STANNO_LLM_API_KEY — Bearer token (use "ollama" for Ollama)
|
| 8 |
+
STANNO_LLM_MODEL — e.g. llama3.2:3b
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
import os
|
| 13 |
+
from typing import Any, Dict, List, Optional
|
| 14 |
+
|
| 15 |
+
from stanno.config.schema import LLMConfig
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class LLMClient:
    """
    Minimal OpenAI-compatible chat client.

    Parameters
    ----------
    config : LLMConfig
        Configuration object. Environment variables override field values.
    """

    def __init__(self, config: "LLMConfig") -> None:
        # httpx is an optional dependency, so it is imported lazily here.
        try:
            import httpx
        except ImportError as exc:
            raise ImportError(
                "httpx is required for LLM integration. "
                "Install it with: pip install stanno[llm]"
            ) from exc

        base_url = os.environ.get("STANNO_LLM_BASE_URL") or config.base_url
        if not base_url:
            # Fail with a clear message instead of AttributeError on None.rstrip.
            raise ValueError(
                "No LLM base URL configured. Set STANNO_LLM_BASE_URL or "
                "LLMConfig.base_url."
            )
        self._base_url: str = base_url.rstrip("/")
        self._api_key: str = (
            os.environ.get("STANNO_LLM_API_KEY") or config.api_key or "none"
        )
        self._model: str = (
            os.environ.get("STANNO_LLM_MODEL") or config.model
        )
        self._temperature: float = config.temperature
        self._max_tokens: int = config.max_tokens
        self._timeout: float = float(config.timeout_seconds)
        self._http = httpx.Client(timeout=self._timeout)

    # ── helpers ───────────────────────────────────────────────────────────────

    def _chat_completions_url(self) -> str:
        """Return the chat-completions endpoint for the configured base URL.

        Accepts base URLs both with and without a trailing ``/v1`` segment so
        that ``http://host`` and ``http://host/v1`` resolve to the same
        endpoint instead of producing ``/v1/v1/...``.
        """
        if self._base_url.endswith("/v1"):
            return f"{self._base_url}/chat/completions"
        return f"{self._base_url}/v1/chat/completions"

    # ── public API ────────────────────────────────────────────────────────────

    def chat_completion(
        self,
        messages: List[Dict[str, str]],
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """
        Send a chat completion request.

        Parameters
        ----------
        messages : list of {"role": str, "content": str}
        **kwargs : overrides for temperature, max_tokens, model, etc.
            Any other keyword is added to the request payload unchanged.

        Returns
        -------
        Full response dict (OpenAI format).

        Raises
        ------
        Whatever ``response.raise_for_status()`` raises for an error status.
        """
        payload: Dict[str, Any] = {
            "model": kwargs.pop("model", self._model),
            "messages": messages,
            "temperature": kwargs.pop("temperature", self._temperature),
            "max_tokens": kwargs.pop("max_tokens", self._max_tokens),
            **kwargs,
        }
        headers = {"Authorization": f"Bearer {self._api_key}"}
        response = self._http.post(
            self._chat_completions_url(),
            json=payload,
            headers=headers,
        )
        response.raise_for_status()
        return response.json()

    def complete(
        self,
        prompt: str,
        system: Optional[str] = None,
        **kwargs: Any,
    ) -> str:
        """
        Convenience wrapper. Returns just the assistant's reply text.

        Parameters
        ----------
        prompt : str
            User message.
        system : str, optional
            System prompt. Omitted from the conversation when empty or None.
        **kwargs : forwarded to chat_completion().
        """
        messages: List[Dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        result = self.chat_completion(messages, **kwargs)
        return result["choices"][0]["message"]["content"]

    def close(self) -> None:
        """Release underlying HTTP connection pool."""
        self._http.close()

    def __enter__(self) -> "LLMClient":
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()

    def __repr__(self) -> str:
        return (
            f"LLMClient(base_url={self._base_url!r}, "
            f"model={self._model!r})"
        )
|
stanno/trainers/__init__.py
ADDED
|
File without changes
|
stanno/trainers/evolutionary.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
EvolutionaryTrainerNet — ES-based trainer, no autodiff required.
|
| 3 |
+
|
| 4 |
+
Strategy
|
| 5 |
+
────────
|
| 6 |
+
The trainer maintains a set of per-layer learning-rate multipliers (the
|
| 7 |
+
"genome"). At each meta-training step it generates a population of
|
| 8 |
+
perturbed genomes, evaluates each by running K training steps on a task,
|
| 9 |
+
and updates the genome toward the perturbations with the best fitness
|
| 10 |
+
(lowest post-update test loss) using the OpenAI ES gradient estimator.
|
| 11 |
+
|
| 12 |
+
During normal training (compute_updates) it delegates the gradient
|
| 13 |
+
computation to FixedTrainerNet and then scales the updates per-layer using
|
| 14 |
+
the evolved multipliers. Before any meta-training the multipliers are all
|
| 15 |
+
1.0, so behaviour is identical to FixedTrainerNet — making this a safe,
|
| 16 |
+
always-functional fallback.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
from typing import List, Optional, Tuple
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
from stanno.core.trainer import AbstractTrainerNet, TraineeState
|
| 25 |
+
from stanno.trainers.fixed import FixedTrainerNet
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class EvolutionaryTrainerNet(AbstractTrainerNet):
    """
    ES-based trainer with per-layer learning-rate multipliers.

    Parameters
    ----------
    learning_rate : float
        Base learning rate (passed to the internal FixedTrainerNet).
    pop_size : int
        ES population size.
    sigma : float
        ES perturbation standard deviation.
    seed : int, optional
        Seed for the NumPy random generator that draws perturbations.
    """

    def __init__(
        self,
        learning_rate: float = 0.01,
        pop_size: int = 20,
        sigma: float = 0.05,
        seed: Optional[int] = None,
    ) -> None:
        self.learning_rate = learning_rate
        self.pop_size = pop_size
        self.sigma = sigma
        self._rng = np.random.default_rng(seed)
        self._fixed = FixedTrainerNet(learning_rate=learning_rate)
        # genome: per-layer scale factors, initialised to 1.0
        # shape determined lazily on first compute_updates call
        self._layer_scales: Optional[np.ndarray] = None

    # ── compute_updates ───────────────────────────────────────────────────────

    def compute_updates(
        self, state: TraineeState
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """
        Delegate gradient computation to FixedTrainerNet, then scale
        per-layer using evolved multipliers.

        If the stored multipliers do not match the number of weight layers in
        ``state`` they are reset to ones, discarding any evolved values.
        """
        n = len(state.weights)

        # Lazy initialisation of scales (depends on number of layers)
        if self._layer_scales is None or len(self._layer_scales) != n:
            self._layer_scales = np.ones(n, dtype=np.float32)

        weight_deltas, bias_deltas = self._fixed.compute_updates(state)

        for i, scale in enumerate(self._layer_scales):
            weight_deltas[i] *= scale
            bias_deltas[i] *= scale

        return weight_deltas, bias_deltas

    # ── meta_train ────────────────────────────────────────────────────────────

    def meta_train(
        self,
        tasks,
        k_steps: int = 10,
        n_iterations: int = 50,
    ) -> None:
        """
        Evolve per-layer learning-rate multipliers to minimise post-update loss.

        Each task must be a dict:
            {"x_train": ndarray, "y_train": ndarray,
             "x_test": ndarray, "y_test": ndarray,
             "stanno_config": STANNOConfig}   ← optional

        Does nothing when ``tasks`` is empty.

        Raises
        ------
        ValueError
            If ``pop_size`` is smaller than 1 (an empty population would turn
            the multipliers into NaN).
        """
        if not tasks:
            return
        if self.pop_size < 1:
            raise ValueError("pop_size must be at least 1 for meta-training.")

        # Infer n_layers from the first task's config or default
        n_layers = self._infer_n_layers(tasks)
        if self._layer_scales is None or len(self._layer_scales) != n_layers:
            self._layer_scales = np.ones(n_layers, dtype=np.float32)

        params = self._layer_scales.copy()

        for iteration in range(n_iterations):
            perturbations = self._rng.normal(
                0.0, self.sigma, (self.pop_size, n_layers)
            ).astype(np.float32)
            fitnesses = np.zeros(self.pop_size, dtype=np.float32)

            for p_idx, pert in enumerate(perturbations):
                candidate_scales = np.clip(params + pert, 0.0, 10.0)
                loss = self._evaluate_tasks(tasks, k_steps, candidate_scales)
                fitnesses[p_idx] = -loss  # negate: higher = better

            # Normalise fitness; a flat population contributes no update.
            std = fitnesses.std()
            if std > 1e-8:
                w = (fitnesses - fitnesses.mean()) / std
            else:
                w = np.zeros_like(fitnesses)

            # ES gradient estimate
            # NOTE(review): perturbations are already drawn with std sigma, so
            # the step below is sigma * mean(perturbation * w) — confirm this
            # step size is intended.
            grad = (perturbations * w[:, np.newaxis]).mean(axis=0)
            params = np.clip(params + self.sigma * grad, 0.0, 10.0)

            if (iteration + 1) % 10 == 0:
                best_loss = self._evaluate_tasks(tasks, k_steps, params)
                print(f"[ES meta-train] iter {iteration+1:3d} loss={best_loss:.5f}")

        self._layer_scales = params

    # ── helpers ───────────────────────────────────────────────────────────────

    def _infer_n_layers(self, tasks) -> int:
        """Return the number of weight layers implied by the first task.

        Uses the task's ``stanno_config`` when present; otherwise returns 2,
        matching the ``[x_dim, 32, y_dim]`` fallback network built in
        ``_evaluate_tasks``.
        """
        cfg = tasks[0].get("stanno_config")
        if cfg is not None:
            return len(cfg.layers) - 1
        return 2  # default 2-layer net

    def _evaluate_tasks(
        self,
        tasks,
        k_steps: int,
        layer_scales: np.ndarray,
    ) -> float:
        """Clone self with candidate scales and evaluate mean test loss.

        For every task a fresh STANNO is built, trained for ``k_steps`` epochs
        with the cloned trainer, and scored by MSE on the task's test split.
        """
        from stanno.config.schema import STANNOConfig
        from stanno.core.stanno import STANNO
        import copy

        candidate = copy.deepcopy(self)
        candidate._layer_scales = layer_scales

        total_loss = 0.0
        for task in tasks:
            # Treat a missing and an explicit-None config the same way, as
            # _infer_n_layers does, and only build the fallback when needed.
            cfg = task.get("stanno_config")
            if cfg is None:
                x_dim = task["x_train"].shape[1] if task["x_train"].ndim > 1 else 1
                y_dim = task["y_train"].shape[1] if task["y_train"].ndim > 1 else 1
                cfg = STANNOConfig(layers=[x_dim, 32, y_dim])

            stanno = STANNO(cfg)
            stanno.trainer = candidate
            stanno.fit(
                task["x_train"],
                task["y_train"],
                epochs=k_steps,
                batch_size=min(32, len(task["x_train"])),
            )
            preds = np.asarray(stanno.predict(task["x_test"]))
            target = np.asarray(task["y_test"])
            # Align e.g. a 1-D (N,) target with (N, 1) predictions; subtracting
            # them directly would broadcast to (N, N) and corrupt the loss.
            if target.shape != preds.shape and target.size == preds.size:
                target = target.reshape(preds.shape)
            total_loss += float(np.mean((preds - target) ** 2))

        return total_loss / max(len(tasks), 1)

    def __repr__(self) -> str:
        scales = (
            np.round(self._layer_scales, 3).tolist()
            if self._layer_scales is not None
            else "uninitialised"
        )
        return (
            f"EvolutionaryTrainerNet(lr={self.learning_rate}, "
            f"pop={self.pop_size}, scales={scales})"
        )
|
stanno/trainers/fixed.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FixedTrainerNet — patent-faithful 4-module trainer.
|
| 3 |
+
|
| 4 |
+
Implements the four modules described in US patent 5852815A (Thaler, 1998):
|
| 5 |
+
|
| 6 |
+
Module 1 — Activation sensitivity:
|
| 7 |
+
How do activations vary with small changes in pre-activation inputs?
|
| 8 |
+
Answers: tanh'(z_i) = 1 − tanh(z_i)² per hidden layer.
|
| 9 |
+
|
| 10 |
+
Module 2 — Activation derivatives:
|
| 11 |
+
The pre-synaptic activations feeding into each weight matrix.
|
| 12 |
+
These are activations[i] for weight W_i.
|
| 13 |
+
|
| 14 |
+
Module 3 — Error terms:
|
| 15 |
+
Delta (δ) signals propagated backward from the output error:
|
| 16 |
+
δ_{n-1} = (2/N) · (y_pred − y_true) [linear output]
|
| 17 |
+
δ_i = (δ_{i+1} @ W_{i+1}.T) · tanh'(z_i) [hidden layers]
|
| 18 |
+
|
| 19 |
+
Module 4 — Weight corrections:
|
| 20 |
+
ΔW_i = −lr · activations[i].T @ δ_i
|
| 21 |
+
Δb_i = −lr · Σ_batch(δ_i)
|
| 22 |
+
|
| 23 |
+
This is mathematically identical to standard backpropagation + SGD, but
|
| 24 |
+
structured as four named, encapsulated operations — making it faithful to
|
| 25 |
+
the STANNO concept of a network object that trains another network object.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
from typing import List, Optional, Tuple
|
| 30 |
+
|
| 31 |
+
import numpy as np
|
| 32 |
+
|
| 33 |
+
from stanno.core.trainer import AbstractTrainerNet, TraineeState
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class FixedTrainerNet(AbstractTrainerNet):
    """
    4-module trainer. Works with any depth TraineeNet. NumPy only.

    Parameters
    ----------
    learning_rate : float
        Step size for weight updates.
    """

    def __init__(self, learning_rate: float = 0.01) -> None:
        self.learning_rate = learning_rate

    # ── Module 1: activation sensitivity ────────────────────────────────────

    def _module1_activation_sensitivity(
        self, pre_activations: List[np.ndarray]
    ) -> List[np.ndarray]:
        """
        Compute tanh'(z_i) = 1 − tanh(z_i)² for each hidden layer.

        Returns a list of the same length as pre_activations.
        For the output layer (linear) this is 1.0 everywhere, so the last
        entry is an array of ones — no special casing needed.
        """
        n = len(pre_activations)
        sensitivities: List[np.ndarray] = []
        for i, z in enumerate(pre_activations):
            if i < n - 1:
                # hidden layer: tanh activation
                sensitivities.append(1.0 - np.tanh(z) ** 2)
            else:
                # output layer: linear activation, derivative = 1
                sensitivities.append(np.ones_like(z))
        return sensitivities

    # ── Module 2: activation derivatives (pre-synaptic values) ──────────────

    def _module2_pre_synaptic(
        self, activations: List[np.ndarray]
    ) -> List[np.ndarray]:
        """
        Return the pre-synaptic activation for each weight matrix.

        activations[i] feeds into W_i, so pre_synaptic[i] = activations[i].
        """
        return activations[:-1]  # drop the final output; len == n_weights

    # ── Module 3: error terms ────────────────────────────────────────────────

    def _module3_error_terms(
        self,
        state: TraineeState,
        sensitivities: List[np.ndarray],
        output_delta: Optional[np.ndarray] = None,
    ) -> List[np.ndarray]:
        """
        Compute delta (δ) error signals, propagating from output to input.

        output_delta : ndarray (batch, output_dim), optional
            If provided, use this as the pre-computed output-layer delta
            (e.g. gradient from a downstream stage in a CascadeSTANNO).
            When None the delta is computed from state.y_batch as usual.

        Returns deltas[i] for each weight layer i (0 … n-1).
        """
        n = len(state.weights)
        deltas: List[np.ndarray] = [None] * n  # type: ignore[list-item]
        batch_size = state.y_pred.shape[0]

        if output_delta is not None:
            deltas[n - 1] = output_delta
        else:
            # Output layer delta: ∂MSE/∂y_pred, times output-layer sensitivity (=1)
            deltas[n - 1] = (2.0 / batch_size) * (state.y_pred - state.y_batch)

        # Propagate backward through hidden layers
        for i in range(n - 2, -1, -1):
            propagated = deltas[i + 1] @ state.weights[i + 1].T
            deltas[i] = propagated * sensitivities[i]

        return deltas

    # ── Module 4: weight corrections ─────────────────────────────────────────

    def _module4_weight_corrections(
        self,
        pre_synaptic: List[np.ndarray],
        deltas: List[np.ndarray],
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """
        Compute ΔW_i and Δb_i for each layer.

            ΔW_i = −lr · pre_synaptic[i].T @ δ_i
            Δb_i = −lr · Σ_batch(δ_i)

        Bias deltas keep a leading axis of length 1 (sum with keepdims=True).
        """
        weight_deltas: List[np.ndarray] = []
        bias_deltas: List[np.ndarray] = []
        for pre, delta in zip(pre_synaptic, deltas):
            weight_deltas.append(-self.learning_rate * (pre.T @ delta))
            bias_deltas.append(
                -self.learning_rate * np.sum(delta, axis=0, keepdims=True)
            )
        return weight_deltas, bias_deltas

    # ── public API ───────────────────────────────────────────────────────────

    def compute_updates(
        self, state: TraineeState
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Run all four modules and return (weight_deltas, bias_deltas)."""
        sensitivities = self._module1_activation_sensitivity(state.pre_activations)
        pre_synaptic = self._module2_pre_synaptic(state.activations)
        deltas = self._module3_error_terms(state, sensitivities)
        return self._module4_weight_corrections(pre_synaptic, deltas)

    def compute_cascade_updates(
        self,
        state: TraineeState,
        output_delta: Optional[np.ndarray] = None,
    ) -> Tuple[List[np.ndarray], List[np.ndarray], np.ndarray]:
        """
        Cascade-aware variant: accepts an upstream output_delta and returns
        the input_gradient for the preceding stage.

        Parameters
        ----------
        state : TraineeState
        output_delta : ndarray (batch, output_dim), optional
            Gradient from the downstream stage. None → compute from y_batch.

        Returns
        -------
        weight_deltas, bias_deltas, input_gradient
            input_gradient shape (batch, input_dim) — pass as output_delta to
            the stage before this one in a CascadeSTANNO.
        """
        sensitivities = self._module1_activation_sensitivity(state.pre_activations)
        pre_synaptic = self._module2_pre_synaptic(state.activations)
        deltas = self._module3_error_terms(state, sensitivities, output_delta)
        dW, db = self._module4_weight_corrections(pre_synaptic, deltas)
        # Gradient at input layer: propagate first hidden delta back through W[0]
        input_gradient = deltas[0] @ state.weights[0].T
        return dW, db, input_gradient

    def __repr__(self) -> str:
        return f"FixedTrainerNet(lr={self.learning_rate})"
|
stanno/trainers/local_rule.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LocalRuleTrainerNet — per-synapse learned update rule.
|
| 3 |
+
|
| 4 |
+
Architecture
|
| 5 |
+
────────────
|
| 6 |
+
A small shared MLP (the "rule network") takes a 4-element feature vector
|
| 7 |
+
per synapse and outputs the weight delta for that synapse:
|
| 8 |
+
|
| 9 |
+
Input features per synapse W_i[j,k]:
|
| 10 |
+
[0] pre_j — mean pre-synaptic activation activations[i][:, j].mean()
|
| 11 |
+
[1] error_k — mean error signal at post-syn delta[i][:, k].mean()
|
| 12 |
+
[2] w_jk — current weight value W_i[j, k]
|
| 13 |
+
[3] is_bias — 0.0 for weights, 1.0 for biases
|
| 14 |
+
|
| 15 |
+
Output: Δw_jk (scalar weight delta)
|
| 16 |
+
|
| 17 |
+
The rule MLP is shared across ALL synapses in the network, which means it
|
| 18 |
+
generalises across layers and architectures.
|
| 19 |
+
|
| 20 |
+
Error signals (delta) are computed by FixedTrainerNet's Module 3 — the
|
| 21 |
+
local rule only replaces Module 4 (the actual weight correction formula).
|
| 22 |
+
|
| 23 |
+
Meta-training (Phase 2b)
|
| 24 |
+
─────────────────────────
|
| 25 |
+
meta_train() adapts the rule MLP itself. If PyTorch is available it uses
|
| 26 |
+
an unrolled K-step differentiation loop. Otherwise it falls back to the
|
| 27 |
+
EvolutionaryTrainerNet's ES routine on the rule MLP parameters.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
from typing import List, Optional, Tuple
|
| 32 |
+
|
| 33 |
+
import numpy as np
|
| 34 |
+
|
| 35 |
+
from stanno.core.trainer import AbstractTrainerNet, TraineeState
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class LocalRuleTrainerNet(AbstractTrainerNet):
    """
    Per-synapse learned update rule.

    A small MLP (the "rule MLP", one tanh hidden layer) is evaluated once per
    synapse on the feature vector ``[pre, error, weight, is_bias]`` and emits
    a scalar; ``compute_updates`` scales that scalar by ``learning_rate`` and
    negates it to form the proposed parameter delta.

    Parameters
    ----------
    learning_rate : float
        Scaling factor applied to all rule MLP outputs.
    hidden_dim : int
        Width of the rule MLP's single hidden layer.
    seed : int, optional
        RNG seed for reproducibility.  It seeds the rule MLP initialisation
        and the perturbations drawn by the ES meta-trainer.
    """

    # Feature vector size: [pre, error, weight, is_bias]
    _FEATURE_DIM = 4

    def __init__(
        self,
        learning_rate: float = 0.01,
        hidden_dim: int = 16,
        seed: Optional[int] = None,
    ) -> None:
        self.learning_rate = learning_rate
        self.hidden_dim = hidden_dim
        # Kept so that ES meta-training can derive its own reproducible stream.
        self._seed = seed
        rng = np.random.default_rng(seed)
        scale = 0.1
        # Draw order (W1 then W2) is part of the seeded behaviour; keep it.
        self._W1 = rng.normal(0.0, scale, (self._FEATURE_DIM, hidden_dim)).astype(np.float32)
        self._b1 = np.zeros((hidden_dim,), dtype=np.float32)
        self._W2 = rng.normal(0.0, scale, (hidden_dim, 1)).astype(np.float32)
        self._b2 = np.zeros((1,), dtype=np.float32)

    # ── rule MLP ─────────────────────────────────────────────────────────────

    def _rule_forward(self, features: np.ndarray) -> np.ndarray:
        """
        Forward pass through the rule MLP.

        features : (N_synapses, 4)
        returns  : (N_synapses,) — one delta per synapse
        """
        h = np.tanh(features @ self._W1 + self._b1)  # (N, hidden)
        out = h @ self._W2 + self._b2                # (N, 1)
        return out.ravel()

    def _build_features(
        self,
        pre_mean: np.ndarray,    # (in_dim,)
        error_mean: np.ndarray,  # (out_dim,)
        weight: np.ndarray,      # (in_dim, out_dim) or (1, out_dim) for biases
        is_bias: float,
    ) -> np.ndarray:
        """
        Build the (N_synapses, 4) feature matrix for one weight tensor.

        For a weight matrix W (in_dim × out_dim), W is stored row-major so
        W.ravel()[j * out_dim + k] = W[j, k].  Accordingly:
          pre_grid[j*out + k]  = pre_mean[j]      (np.repeat)
          err_grid[j*out + k]  = error_mean[k]    (np.tile)

        ``is_bias`` is used both as a truthy switch (bias vs. weight layout)
        and as the literal value of the fourth feature column.
        """
        flat_w = weight.ravel()
        n_syn = flat_w.size

        if is_bias:
            # bias: pre is always 1, one entry per output neuron
            pre_col = np.ones(n_syn, dtype=np.float32)
            err_col = error_mean.ravel()[:n_syn].astype(np.float32)
        else:
            in_dim, out_dim = weight.shape
            pre_col = np.repeat(pre_mean, out_dim).astype(np.float32)  # (in*out,)
            err_col = np.tile(error_mean, in_dim).astype(np.float32)   # (in*out,)

        bias_col = np.full(n_syn, is_bias, dtype=np.float32)

        return np.stack([pre_col, err_col, flat_w.astype(np.float32), bias_col], axis=1)

    # ── error terms (reuse Fixed module 3) ───────────────────────────────────

    def _compute_deltas(self, state: TraineeState) -> List[np.ndarray]:
        """Compute backward error signals using the same math as FixedTrainerNet.

        The last entry is ``(2 / batch_size) * (y_pred - y_batch)``; earlier
        entries are obtained by propagating through ``weights[i + 1].T`` and
        multiplying by the tanh derivative at ``pre_activations[i]``.
        """
        n = len(state.weights)
        deltas: List[np.ndarray] = [None] * n  # type: ignore[list-item]
        batch_size = state.y_pred.shape[0]
        deltas[n - 1] = (2.0 / batch_size) * (state.y_pred - state.y_batch)
        for i in range(n - 2, -1, -1):
            propagated = deltas[i + 1] @ state.weights[i + 1].T
            # Use pre_activations (z) for derivative: tanh'(z) = 1 - tanh(z)²
            # This is exact; using activations[i+1] is equivalent but less precise.
            deltas[i] = propagated * (1.0 - np.tanh(state.pre_activations[i]) ** 2)
        return deltas

    # ── public API ───────────────────────────────────────────────────────────

    def compute_updates(
        self, state: TraineeState
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Return per-layer ``(weight_deltas, bias_deltas)`` proposed by the rule.

        Each entry has the shape of the corresponding weight/bias array and
        equals ``-learning_rate * rule_output``.  Presynaptic activity and
        error are averaged over the batch axis before being fed to the rule.
        """
        deltas = self._compute_deltas(state)
        weight_deltas: List[np.ndarray] = []
        bias_deltas: List[np.ndarray] = []

        for i, (W, b, delta) in enumerate(
            zip(state.weights, state.biases, deltas)
        ):
            pre_mean = state.activations[i].mean(axis=0)  # (in_dim,)
            error_mean = delta.mean(axis=0)               # (out_dim,)

            # Weight update
            feat_w = self._build_features(pre_mean, error_mean, W, is_bias=0.0)
            dw = self._rule_forward(feat_w).reshape(W.shape) * self.learning_rate
            weight_deltas.append(-dw)

            # Bias update
            feat_b = self._build_features(pre_mean, error_mean, b, is_bias=1.0)
            db = self._rule_forward(feat_b).reshape(b.shape) * self.learning_rate
            bias_deltas.append(-db)

        return weight_deltas, bias_deltas

    # ── meta-training (Phase 2b) ──────────────────────────────────────────────

    def meta_train(self, tasks, k_steps: int = 5, meta_lr: float = 1e-3) -> None:
        """
        Adapt the rule MLP to minimise post-update task loss.

        If PyTorch is available: unrolled K-step gradient loop (currently a
        stub that delegates to ES, see ``_meta_train_torch``).
        Otherwise: ES fallback on the rule MLP parameters.

        Each task must be a dict:
            {"x_train": ndarray, "y_train": ndarray,
             "x_test":  ndarray, "y_test":  ndarray,
             "stanno_config": STANNOConfig}
        """
        try:
            # Imported only to probe availability; the torch path does not
            # use it yet.
            import torch  # noqa: F401
            self._meta_train_torch(tasks, k_steps=k_steps, meta_lr=meta_lr)
        except ImportError:
            self._meta_train_es(tasks, k_steps=k_steps)

    def _meta_train_es(
        self,
        tasks,
        k_steps: int = 5,
        pop_size: int = 20,
        sigma: float = 0.05,
        n_iterations: int = 50,
    ) -> None:
        """ES-based meta-training: perturb rule MLP params, follow fitness.

        Each iteration draws ``pop_size`` Gaussian perturbations (std
        ``sigma``), scores every candidate with ``_evaluate_tasks`` and moves
        the parameters along the fitness-weighted mean perturbation.

        When the trainer was built with a seed, the perturbation stream is
        derived from it and is therefore repeatable.  NOTE(review): full
        run-to-run reproducibility also depends on how TraineeNet/STANNO seed
        themselves, which is not visible from here.
        """
        seed = getattr(self, "_seed", None)
        if seed is None:
            rng = np.random.default_rng()
        else:
            # Spawn a child stream so perturbations are not correlated with
            # the draws that initialised the rule MLP from the same seed.
            rng = np.random.default_rng(np.random.SeedSequence(seed).spawn(1)[0])

        params = self._flat_params()

        try:
            for _ in range(n_iterations):
                perturbations = rng.normal(0.0, sigma, (pop_size, len(params))).astype(np.float32)
                fitnesses = np.zeros(pop_size, dtype=np.float32)

                for p_idx, pert in enumerate(perturbations):
                    candidate = params + pert
                    self._set_flat_params(candidate)
                    loss = self._evaluate_tasks(tasks, k_steps)
                    fitnesses[p_idx] = -loss  # higher = better

                # Fitness-weighted update
                w = (fitnesses - fitnesses.mean()) / (fitnesses.std() + 1e-8)
                gradient_estimate = (perturbations * w[:, np.newaxis]).mean(axis=0)
                params = params + sigma * gradient_estimate
        finally:
            # Always leave the rule MLP on the last accepted iterate, never on
            # a half-evaluated candidate (e.g. if a task evaluation raises).
            self._set_flat_params(params)

    def _evaluate_tasks(self, tasks, k_steps: int) -> float:
        """Run k_steps of training on each task; return mean test MSE.

        Returns 0.0 when ``tasks`` is empty.
        """
        import copy

        from stanno.config.schema import STANNOConfig
        from stanno.core.stanno import STANNO
        from stanno.core.trainee import TraineeNet

        total_loss = 0.0
        for task in tasks:
            cfg = task.get("stanno_config")
            if cfg is None:
                # Built lazily so a task that ships its own config is not
                # required to have 2-D x_train / y_train just for the default.
                cfg = STANNOConfig(
                    layers=[task["x_train"].shape[1], 32, task["y_train"].shape[1]]
                )
            net = TraineeNet(cfg.layers)
            # Shallow copy: shares the rule MLP arrays, i.e. the current params.
            trainer = copy.copy(self)

            stanno = STANNO(cfg)
            stanno.net = net
            stanno.trainer = trainer
            stanno.fit(
                task["x_train"],
                task["y_train"],
                epochs=k_steps,
                batch_size=min(32, len(task["x_train"])),
            )

            preds = stanno.predict(task["x_test"])
            total_loss += float(np.mean((preds - task["y_test"]) ** 2))

        return total_loss / max(len(tasks), 1)

    def _meta_train_torch(self, tasks, k_steps: int = 5, meta_lr: float = 1e-3) -> None:
        """Unrolled K-step meta-training via PyTorch autograd (Phase 2b)."""
        # NOTE: Full implementation requires converting TraineeNet to
        # differentiable PyTorch ops and running the update chain through
        # autograd.  This is left as a Phase 2b extension.
        # For now, fall back to ES even when PyTorch is available
        # (``meta_lr`` is therefore unused).
        self._meta_train_es(tasks, k_steps=k_steps)

    # ── parameter serialisation helpers ──────────────────────────────────────

    def _flat_params(self) -> np.ndarray:
        """Return a new 1-D array: W1, b1, W2, b2 concatenated in that order."""
        return np.concatenate([
            self._W1.ravel(), self._b1.ravel(),
            self._W2.ravel(), self._b2.ravel(),
        ])

    def _set_flat_params(self, flat: np.ndarray) -> None:
        """Write a flat vector (layout of ``_flat_params``) into the rule MLP in place.

        Raises
        ------
        ValueError
            If ``flat`` does not hold exactly one value per rule parameter.
        """
        flat = np.asarray(flat).ravel()
        arrays = (self._W1, self._b1, self._W2, self._b2)
        expected = sum(arr.size for arr in arrays)
        if flat.size != expected:
            raise ValueError(
                f"expected {expected} rule parameters, got {flat.size}"
            )
        idx = 0
        for arr in arrays:
            n = arr.size
            # Assign through the array itself: ``arr.ravel()`` may return a
            # copy for non-contiguous arrays, which would drop the write.
            arr[...] = flat[idx: idx + n].reshape(arr.shape)
            idx += n

    def __repr__(self) -> str:
        return (
            f"LocalRuleTrainerNet(lr={self.learning_rate}, "
            f"hidden_dim={self.hidden_dim})"
        )
|
stanno_poc.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""STANNO-style proof of concept
|
| 2 |
+
|
| 3 |
+
This module implements a very simple Self-Training Artificial Neural Network Object (STANNO)
|
| 4 |
+
loosely inspired by Thaler's description: two neural networks, one of which trains the other,
|
| 5 |
+
optionally folded into a single object.
|
| 6 |
+
|
| 7 |
+
Design choices:
|
| 8 |
+
- TraineeNet: a small multilayer perceptron (MLP) that learns a supervised mapping.
|
| 9 |
+
- Trainer: training logic embedded inside STANNO using standard gradient descent.
|
| 10 |
+
Conceptually this plays the role of the "trainer" network described in the literature,
|
| 11 |
+
but here it is implemented as explicit code for simplicity.
|
| 12 |
+
|
| 13 |
+
Features included for experimentation:
|
| 14 |
+
- Supervised training on a toy dataset (e.g., y = sin(x)).
|
| 15 |
+
- "Dreaming": run the trained net on a fixed or random latent input with inputs partially
|
| 16 |
+
or totally "blinded" (set to zero or constant) to observe internal dynamics.
|
| 17 |
+
- Noise injection: add Gaussian noise with adjustable standard deviation to all weights,
|
| 18 |
+
to explore how output complexity changes with noise level (from "stupidity" to chaos).
|
| 19 |
+
- Lesioning: randomly zero out a fraction of weights to mimic progressive "death" of
|
| 20 |
+
connections and observe degradation ("tunnel vision").
|
| 21 |
+
|
| 22 |
+
The goal is not to reproduce the original spreadsheet implementation, but to give a
|
| 23 |
+
simple, hackable playground in modern Python/NumPy that you can extend (including
|
| 24 |
+
replacing the hard-coded trainer by a learned meta-network if desired).
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
import numpy as np
|
| 29 |
+
from dataclasses import dataclass
|
| 30 |
+
from typing import Tuple, Callable
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class TraineeNet:
    """Minimal MLP with one tanh hidden layer (input -> hidden -> output).

    This is the network whose weights the STANNO object trains.
    """

    input_dim: int
    hidden_dim: int
    output_dim: int

    def __post_init__(self) -> None:
        generator = np.random.default_rng()
        # Xavier-like initialization: std = 1 / sqrt(fan_in) for each layer.
        std_in = 1.0 / np.sqrt(self.input_dim)
        std_hidden = 1.0 / np.sqrt(self.hidden_dim)
        self.W1 = generator.normal(0.0, std_in, (self.input_dim, self.hidden_dim))
        self.b1 = np.zeros((1, self.hidden_dim))
        self.W2 = generator.normal(0.0, std_hidden, (self.hidden_dim, self.output_dim))
        self.b2 = np.zeros((1, self.output_dim))

    def parameters(self):
        """Return the live parameter arrays in the order W1, b1, W2, b2."""
        return [self.W1, self.b1, self.W2, self.b2]

    def forward(self, x: np.ndarray) -> Tuple[np.ndarray, dict]:
        """Run the network on ``x``; return the output and a backprop cache.

        The cache holds the input and every intermediate under the keys
        ``"x"``, ``"z1"``, ``"a1"`` and ``"z2"``.
        """
        hidden_pre = x @ self.W1 + self.b1
        hidden_act = np.tanh(hidden_pre)
        # Linear output (regression); a softmax could be added for classification.
        output = hidden_act @ self.W2 + self.b2
        return output, {"x": x, "z1": hidden_pre, "a1": hidden_act, "z2": output}

    def apply_parameter_noise(self, sigma: float, rng: np.random.Generator | None = None) -> None:
        """Perturb every parameter in place with N(0, sigma) noise.

        Does nothing when ``sigma <= 0``.
        """
        if sigma <= 0:
            return
        source = np.random.default_rng() if rng is None else rng
        for tensor in self.parameters():
            noise = source.normal(0.0, sigma, tensor.shape)
            tensor += noise

    def lesion(self, fraction: float, rng: np.random.Generator | None = None) -> None:
        """Zero each weight independently with probability ``fraction``.

        ``fraction`` is clipped to [0, 1].  Only W1 and W2 are affected;
        biases are left untouched.
        """
        fraction = float(np.clip(fraction, 0.0, 1.0))
        if fraction <= 0:
            return
        source = np.random.default_rng() if rng is None else rng
        for weights in (self.W1, self.W2):
            dead = source.random(weights.shape) < fraction
            weights[dead] = 0.0
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class STANNO:
    """Self-Training Neural Network Object (STANNO-style).

    Encapsulates:
    - A trainable network (TraineeNet).
    - An internal training algorithm (gradient descent) that acts as the
      "trainer" and updates the weights from examples.

    This follows the spirit of the STANNOs described by Thaler: one object
    holding both the network and its training mechanism, able to keep
    learning online.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        learning_rate: float = 1e-2,
    ) -> None:
        self.net = TraineeNet(input_dim, hidden_dim, output_dim)
        self.learning_rate = learning_rate

    # ---------------------- Core training logic ----------------------

    def _loss_and_grads(self, x: np.ndarray, y_true: np.ndarray) -> Tuple[float, list]:
        """Compute MSE loss and gradients via backprop for one batch.

        Returns ``(loss, [dW1, db1, dW2, db2])``, ordered like
        ``TraineeNet.parameters()``.  The reported loss is the mean over all
        elements, while the output gradient is scaled by ``2 / batch_size``.
        """
        y_pred, cache = self.net.forward(x)
        # Mean squared error
        diff = y_pred - y_true
        loss = float(np.mean(diff ** 2))

        # Backprop
        batch_size = x.shape[0]
        dL_dy = (2.0 / batch_size) * diff  # dL/dy

        # Layer 2 (linear output)
        a1 = cache["a1"]
        dL_dW2 = a1.T @ dL_dy
        dL_db2 = np.sum(dL_dy, axis=0, keepdims=True)

        # Back through tanh: d tanh(z)/dz = 1 - tanh(z)^2
        da1 = dL_dy @ self.net.W2.T
        dz1 = da1 * (1.0 - np.tanh(cache["z1"]) ** 2)

        # Layer 1
        x_batch = cache["x"]
        dL_dW1 = x_batch.T @ dz1
        dL_db1 = np.sum(dz1, axis=0, keepdims=True)

        grads = [dL_dW1, dL_db1, dL_dW2, dL_db2]
        return loss, grads

    def trainer_step(self, x: np.ndarray, y_true: np.ndarray) -> float:
        """One training step of the internal trainer over a mini-batch.

        Conceptually this is the "trainer network" that adjusts the weights
        of the TraineeNet.  Here it is implemented as plain gradient descent,
        updating the parameters in place.  Returns the pre-update batch loss.
        """
        loss, grads = self._loss_and_grads(x, y_true)
        for param, grad in zip(self.net.parameters(), grads):
            param -= self.learning_rate * grad
        return loss

    def fit(
        self,
        x: np.ndarray,
        y: np.ndarray,
        epochs: int = 1000,
        batch_size: int = 32,
        shuffle: bool = True,
        callback: Callable[[int, float], None] | None = None,
    ) -> None:
        """Train on a dataset using internal trainer.

        Args:
            x: shape (N, input_dim)
            y: shape (N, output_dim)
            epochs: number of passes over the dataset
            batch_size: mini-batch size
            shuffle: whether to shuffle each epoch
            callback: optional function(epoch, loss) for logging; ``loss`` is
                the mean of the per-batch losses of that epoch
        """
        N = x.shape[0]
        rng = np.random.default_rng()

        for epoch in range(epochs):
            idx = np.arange(N)
            if shuffle:
                rng.shuffle(idx)
            x_shuf = x[idx]
            y_shuf = y[idx]

            losses = []
            for start in range(0, N, batch_size):
                end = start + batch_size
                xb = x_shuf[start:end]
                yb = y_shuf[start:end]
                loss = self.trainer_step(xb, yb)
                losses.append(loss)

            mean_loss = float(np.mean(losses))
            if callback is not None:
                callback(epoch, mean_loss)

    # ---------------------- Inference & "dreaming" ----------------------

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Return the network output for ``x`` (forward pass only)."""
        y, _ = self.net.forward(x)
        return y

    def dream(
        self,
        num_steps: int = 128,
        input_seed: np.ndarray | None = None,
        noise_sigma: float = 0.0,
        blind_inputs: bool = False,
        rng: np.random.Generator | None = None,
    ) -> np.ndarray:
        """Generate a sequence of outputs by driving the net with a simple or blind input.

        Args:
            num_steps: length of the sequence to generate.
            input_seed: initial input vector; if None, uses zeros.
            noise_sigma: amount of noise to add to weights *once* before dreaming.
            blind_inputs: if True, inputs are forced to zero every step.
            rng: optional RNG.

        Returns:
            Array of generated outputs of shape (num_steps, output_dim);
            an empty (0, output_dim) array when ``num_steps <= 0``.
        """
        if num_steps <= 0:
            # np.concatenate rejects an empty list; honour the documented shape.
            return np.empty((0, self.net.output_dim))

        if rng is None:
            rng = np.random.default_rng()

        # Work on a copy so as not to permanently corrupt the trained net
        shadow = TraineeNet(self.net.input_dim, self.net.hidden_dim, self.net.output_dim)
        shadow.W1 = self.net.W1.copy()
        shadow.b1 = self.net.b1.copy()
        shadow.W2 = self.net.W2.copy()
        shadow.b2 = self.net.b2.copy()
        shadow.apply_parameter_noise(noise_sigma, rng=rng)

        if input_seed is None:
            x = np.zeros((1, self.net.input_dim))
        else:
            x = input_seed.reshape(1, -1)

        outputs = []
        for _ in range(num_steps):
            if blind_inputs:
                x_step = np.zeros_like(x)
            else:
                x_step = x
            y, _ = shadow.forward(x_step)
            outputs.append(y.copy())
            # Simple feedback: feed the output (or part of it) as next input.
            # This makes the sequence sensitive to internal weights.
            if self.net.output_dim == self.net.input_dim:
                x = y
            else:
                # Match the input width: repeat each output element, then
                # truncate to input_dim columns.
                x = np.repeat(y, self.net.input_dim // self.net.output_dim + 1, axis=1)[
                    :, : self.net.input_dim
                ]

        return np.concatenate(outputs, axis=0)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ---------------------- Demo utilities ----------------------
|
| 258 |
+
|
| 259 |
+
def make_sin_dataset(
    n_samples: int = 256,
    rng: np.random.Generator | None = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Simple 1D regression dataset: y = sin(x), x uniform on [0, 2π).

    Args:
        n_samples: number of (x, y) pairs to draw.
        rng: optional generator for reproducible datasets; a fresh unseeded
            generator is used when omitted.

    Returns:
        ``(x, y)``, both of shape (n_samples, 1).
    """
    if rng is None:
        rng = np.random.default_rng()
    x = rng.uniform(0.0, 2.0 * np.pi, size=(n_samples, 1))
    y = np.sin(x)
    return x, y
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def demo_train_and_dream() -> None:
    """Train a STANNO on sin(x) and then explore noise/lesion effects.

    Run this function directly ("python stanno_poc.py") to see numeric output.
    All results are printed; nothing is returned.
    """
    x, y = make_sin_dataset(512)
    stanno = STANNO(input_dim=1, hidden_dim=32, output_dim=1, learning_rate=5e-3)

    def log_progress(epoch: int, loss: float) -> None:
        # Report every 100th epoch only, to keep the console readable.
        if (epoch + 1) % 100 == 0:
            print(f"Epoch {epoch:4d} loss={loss:.5f}")

    print("Training STANNO on y = sin(x)...")
    stanno.fit(
        x,
        y,
        epochs=500,
        batch_size=64,
        callback=log_progress,
    )

    # Evaluate basic fit
    xs = np.linspace(0, 2 * np.pi, 16).reshape(-1, 1)
    preds = stanno.predict(xs)
    print("\nSample predictions after training:")
    for xi, yi, yi_hat in zip(xs.flatten(), np.sin(xs).flatten(), preds.flatten()):
        print(f"x={xi:5.2f} sin(x)={yi: .3f} pred={yi_hat: .3f}")

    # Dreaming with different noise levels
    for sigma in [0.0, 0.05, 0.2, 0.5]:
        seq = stanno.dream(num_steps=32, noise_sigma=sigma, blind_inputs=True)
        print(f"\nDreaming with noise_sigma={sigma} (first 10 outputs):")
        print(np.round(seq[:10].flatten(), 3))

    # Lesion experiment
    print("\nLesioning 70% of weights and evaluating error on test points...")
    # Backup parameters so the trained net can be restored afterwards
    backup = [p.copy() for p in stanno.net.parameters()]
    try:
        stanno.net.lesion(fraction=0.7)
        preds_lesioned = stanno.predict(xs)
        mse_lesioned = float(np.mean((preds_lesioned - np.sin(xs)) ** 2))
        print(f"MSE after lesioning 70% of weights: {mse_lesioned:.4f}")
    finally:
        # Restore in place, even if evaluation fails
        for param, saved in zip(stanno.net.parameters(), backup):
            param[...] = saved
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    demo_train_and_dream()
|