"""
Simulation engine for deep network signal propagation.
This module simulates how signals propagate through deep residual networks
with different residual mixing strategies:
- baseline: Identity matrices (no mixing, standard residual connections)
- hc: Random unconstrained matrices (Hyper-Connections)
- mhc: Sinkhorn-projected doubly stochastic matrices (Manifold-Constrained HC)
Key insight from the mHC paper:
The COMPOSITE mapping (product of all layer matrices H_L @ H_{L-1} @ ... @ H_0)
is what matters for signal propagation:
- For HC: composite gains explode exponentially (3000x+ at depth 64)
- For mHC: composite gains stay bounded (~1.6x at depth 64)
This happens because doubly stochastic matrices are closed under multiplication.
Author: Subhadip Mitra <contact@subhadipmitra.com>
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
"""
import numpy as np
from typing import Dict, Literal, Optional
from .sinkhorn import sinkhorn_knopp
from .metrics import compute_all_metrics
def generate_residual_matrix(
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    rng: Optional[np.random.Generator] = None
) -> np.ndarray:
    """
    Generate a residual mixing matrix.

    Args:
        n: Size of square matrix (number of streams)
        method: One of:
            - 'baseline': Identity matrix (no mixing)
            - 'hc': Random matrix with N(0, 1) entries
            - 'mhc': Random matrix projected to doubly stochastic via Sinkhorn
        sinkhorn_iters: Number of Sinkhorn iterations for mHC method.
            At 0, the raw random matrix is returned (identical to 'hc') so
            the transition from explosive to stable behavior can be shown.
        rng: Random number generator for reproducibility. A fresh default
            generator is created when omitted.

    Returns:
        Residual mixing matrix of shape (n, n)

    Raises:
        ValueError: If `method` is not one of the three supported names.
            Validation happens before any random numbers are drawn, so an
            invalid call never advances `rng`.

    Example:
        >>> rng = np.random.default_rng(42)
        >>> M = generate_residual_matrix(4, 'mhc', sinkhorn_iters=20, rng=rng)
        >>> M.shape
        (4, 4)
    """
    # Validate first: previously an unknown method consumed RNG entropy
    # (one full (n, n) draw) before raising, silently perturbing the
    # caller's generator state.
    if method not in ('baseline', 'hc', 'mhc'):
        raise ValueError(f"Unknown method: {method}. Expected 'baseline', 'hc', or 'mhc'.")

    if method == 'baseline':
        return np.eye(n)

    if rng is None:
        rng = np.random.default_rng()

    # Shared random base for both HC and mHC so they are directly comparable.
    M = rng.standard_normal((n, n))

    # At k=0, return raw random matrix (same as HC) to show explosive behavior;
    # at k>0, apply Sinkhorn projection to show the transition to stability.
    if method == 'hc' or sinkhorn_iters == 0:
        return M
    return sinkhorn_knopp(M, iterations=sinkhorn_iters)
def simulate_depth(
    depth: int,
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Simulate signal propagation through a deep residual network.

    Draws `depth` residual matrices and records two metric streams: one for
    each individual layer matrix, and one for the running composite product

        Composite(l) = H_l @ H_{l-1} @ ... @ H_1 @ H_0

    which is the total transformation a signal undergoes from the input up
    to layer l.

    Args:
        depth: Number of layers to simulate
        n: Matrix size (number of streams in multi-stream residual)
        method: Residual mixing strategy ('baseline', 'hc', or 'mhc')
        sinkhorn_iters: Number of Sinkhorn iterations for mHC
        seed: Random seed for reproducibility

    Returns:
        Dict containing:
            - 'method': str - the method used
            - 'depth': int - number of layers
            - 'n': int - matrix size
            - 'sinkhorn_iters': int - Sinkhorn iterations used
            - 'seed': int - random seed used
            - 'per_layer': list of dicts with metrics for each layer's matrix
            - 'composite': list of dicts with metrics for composite at each depth

    Example:
        >>> result = simulate_depth(64, 4, 'mhc', seed=42)
        >>> result['composite'][-1]['forward_gain'] < 5
        True
    """
    rng = np.random.default_rng(seed)

    layer_records = []
    composite_records = []
    running_product = np.eye(n)  # composite starts as the identity

    for idx in range(depth):
        matrix = generate_residual_matrix(n, method, sinkhorn_iters, rng)
        layer_records.append({'layer': idx, **compute_all_metrics(matrix)})

        # Left-multiply so running_product == H_idx @ ... @ H_0.
        running_product = matrix @ running_product
        composite_records.append({'upto_layer': idx, **compute_all_metrics(running_product)})

    return {
        'method': method,
        'depth': depth,
        'n': n,
        'sinkhorn_iters': sinkhorn_iters,
        'seed': seed,
        'per_layer': layer_records,
        'composite': composite_records,
    }
def run_comparison(
    depth: int = 64,
    n: int = 4,
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Run simulation for all three methods and return comparison.

    Main entry point for generating comparison data: runs simulate_depth
    for baseline, HC, and mHC with identical parameters (including the
    seed) so the three result sets are directly comparable.

    Args:
        depth: Number of layers to simulate
        n: Matrix size (number of streams)
        sinkhorn_iters: Number of Sinkhorn iterations for mHC
        seed: Random seed (same seed used for all methods for fair comparison)

    Returns:
        Dict with keys 'baseline', 'hc', 'mhc' containing simulation results

    Example:
        >>> results = run_comparison(depth=64, n=4, seed=42)
        >>> # Baseline should stay at 1
        >>> results['baseline']['composite'][-1]['forward_gain']
        1.0
        >>> # HC should explode
        >>> results['hc']['composite'][-1]['forward_gain'] > 10
        True
        >>> # mHC should stay bounded
        >>> results['mhc']['composite'][-1]['forward_gain'] < 5
        True
    """
    return {
        strategy: simulate_depth(depth, n, strategy, sinkhorn_iters, seed)
        for strategy in ('baseline', 'hc', 'mhc')
    }
if __name__ == "__main__":
    # Self-contained demo: compare all three mixing strategies at depth 64
    # and report the final composite metrics for each.
    print("Running mHC simulation comparison...")
    print("=" * 50)
    comparison = run_comparison(depth=64, n=4, seed=42)
    for strategy in ('baseline', 'hc', 'mhc'):
        summary = comparison[strategy]['composite'][-1]
        print(f"\n{strategy.upper()}:")
        print(f" Final composite forward_gain: {summary['forward_gain']:.4f}")
        print(f" Final composite backward_gain: {summary['backward_gain']:.4f}")
        print(f" Final composite spectral_norm: {summary['spectral_norm']:.4f}")