File size: 6,886 Bytes
8ab7e68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""
Simulation engine for deep network signal propagation.

This module simulates how signals propagate through deep residual networks
with different residual mixing strategies:

- baseline: Identity matrices (no mixing, standard residual connections)
- hc: Random unconstrained matrices (Hyper-Connections)
- mhc: Sinkhorn-projected doubly stochastic matrices (Manifold-Constrained HC)

Key insight from the mHC paper:
The COMPOSITE mapping (product of all layer matrices H_L @ H_{L-1} @ ... @ H_0)
is what matters for signal propagation:
- For HC: composite gains explode exponentially (3000x+ at depth 64)
- For mHC: composite gains stay bounded (~1.6x at depth 64)

This happens because doubly stochastic matrices are closed under multiplication.

Author: Subhadip Mitra <contact@subhadipmitra.com>
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
"""

import numpy as np
from typing import Dict, Literal, Optional

from .sinkhorn import sinkhorn_knopp
from .metrics import compute_all_metrics


def generate_residual_matrix(
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    rng: Optional[np.random.Generator] = None
) -> np.ndarray:
    """
    Build one layer's residual mixing matrix for the chosen strategy.

    Args:
        n: Number of residual streams; the result is an (n, n) matrix.
        method: Mixing strategy:
            - 'baseline': identity (plain residual connection, no mixing)
            - 'hc': unconstrained random matrix with i.i.d. N(0, 1) entries
            - 'mhc': the same random draw, projected toward the doubly
              stochastic manifold via Sinkhorn-Knopp
        sinkhorn_iters: Sinkhorn iteration count for 'mhc'. With 0 iterations
            the raw random matrix is returned unchanged, which makes mHC
            coincide with HC and exposes the unprojected (explosive) regime.
        rng: Optional generator for reproducible draws; a fresh default
            generator is created when omitted.

    Returns:
        The (n, n) residual mixing matrix.

    Raises:
        ValueError: If `method` is not one of the three supported strategies.

    Example:
        >>> rng = np.random.default_rng(42)
        >>> M = generate_residual_matrix(4, 'mhc', sinkhorn_iters=20, rng=rng)
        >>> M.shape
        (4, 4)
    """
    if rng is None:
        rng = np.random.default_rng()

    # Identity: streams pass through untouched.
    if method == 'baseline':
        return np.eye(n)

    # Both HC and mHC start from the same Gaussian draw.
    raw = rng.standard_normal((n, n))

    if method == 'hc':
        return raw

    if method == 'mhc':
        # k=0 deliberately mirrors HC so plots can show the transition
        # from explosive to bounded behavior as k grows.
        return raw if sinkhorn_iters == 0 else sinkhorn_knopp(raw, iterations=sinkhorn_iters)

    raise ValueError(f"Unknown method: {method}. Expected 'baseline', 'hc', or 'mhc'.")


def simulate_depth(
    depth: int,
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Propagate through `depth` stacked residual mixing matrices and record metrics.

    At each layer l a fresh matrix H_l is drawn and the cumulative product

        Composite(l) = H_l @ H_{l-1} @ ... @ H_1 @ H_0

    is updated. The composite is the total input-to-layer-l transformation,
    which is the quantity the mHC paper argues actually governs signal
    propagation at depth.

    Args:
        depth: Number of layers to simulate.
        n: Matrix size (number of streams in the multi-stream residual).
        method: Residual mixing strategy ('baseline', 'hc', or 'mhc').
        sinkhorn_iters: Sinkhorn iterations passed through for mHC.
        seed: Seed for the layer-matrix draws, for reproducibility.

    Returns:
        Dict with the run parameters ('method', 'depth', 'n',
        'sinkhorn_iters', 'seed') plus:
        - 'per_layer': one metrics dict per individual layer matrix
        - 'composite': one metrics dict per cumulative product depth

    Example:
        >>> result = simulate_depth(64, 4, 'mhc', seed=42)
        >>> result['composite'][-1]['forward_gain'] < 5
        True
    """
    rng = np.random.default_rng(seed)

    layer_records = []
    cumulative_records = []
    running = np.eye(n)  # Composite starts as the identity.

    for idx in range(depth):
        H = generate_residual_matrix(n, method, sinkhorn_iters, rng)

        # Metrics of this layer's matrix on its own.
        record = {'layer': idx}
        record.update(compute_all_metrics(H))
        layer_records.append(record)

        # Left-multiply so running == H_idx @ ... @ H_0, then snapshot
        # the composite's metrics at this depth.
        running = H @ running
        snapshot = {'upto_layer': idx}
        snapshot.update(compute_all_metrics(running))
        cumulative_records.append(snapshot)

    return {
        'method': method,
        'depth': depth,
        'n': n,
        'sinkhorn_iters': sinkhorn_iters,
        'seed': seed,
        'per_layer': layer_records,
        'composite': cumulative_records,
    }


def run_comparison(
    depth: int = 64,
    n: int = 4,
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Run the depth simulation for all three mixing strategies side by side.

    Main entry point for producing comparison data: each method gets the
    identical (depth, n, sinkhorn_iters, seed) configuration so that the
    resulting curves are directly comparable.

    Args:
        depth: Number of layers to simulate.
        n: Matrix size (number of streams).
        sinkhorn_iters: Sinkhorn iterations used by the mHC method.
        seed: Shared random seed so the comparison is fair across methods.

    Returns:
        Dict keyed by 'baseline', 'hc', and 'mhc', each holding the
        corresponding simulate_depth result.

    Example:
        >>> results = run_comparison(depth=64, n=4, seed=42)
        >>> # Baseline should stay at 1
        >>> results['baseline']['composite'][-1]['forward_gain']
        1.0
        >>> # HC should explode
        >>> results['hc']['composite'][-1]['forward_gain'] > 10
        True
        >>> # mHC should stay bounded
        >>> results['mhc']['composite'][-1]['forward_gain'] < 5
        True
    """
    return {
        strategy: simulate_depth(depth, n, strategy, sinkhorn_iters, seed)
        for strategy in ('baseline', 'hc', 'mhc')
    }


if __name__ == "__main__":
    # Smoke demo: run the three-way comparison and print the final
    # composite metrics for each method.
    print("Running mHC simulation comparison...")
    print("=" * 50)

    results = run_comparison(depth=64, n=4, seed=42)

    for name in ('baseline', 'hc', 'mhc'):
        summary = results[name]['composite'][-1]
        print(f"\n{name.upper()}:")
        print(f"  Final composite forward_gain:  {summary['forward_gain']:.4f}")
        print(f"  Final composite backward_gain: {summary['backward_gain']:.4f}")
        print(f"  Final composite spectral_norm: {summary['spectral_norm']:.4f}")