"""
Simulation engine for deep network signal propagation.

This module simulates how signals propagate through deep residual networks
with different residual mixing strategies:

- baseline: Identity matrices (no mixing, standard residual connections)
- hc: Random unconstrained matrices (Hyper-Connections)
- mhc: Sinkhorn-projected doubly stochastic matrices (Manifold-Constrained HC)

Key insight from the mHC paper:
The COMPOSITE mapping (product of all layer matrices H_L @ H_{L-1} @ ... @ H_0)
is what matters for signal propagation:
- For HC: composite gains explode exponentially (3000x+ at depth 64)
- For mHC: composite gains stay bounded (~1.6x at depth 64)

This happens because doubly stochastic matrices are closed under multiplication.

Author: Subhadip Mitra <contact@subhadipmitra.com>
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
"""

import numpy as np
from typing import Dict, Literal, Optional

from .sinkhorn import sinkhorn_knopp
from .metrics import compute_all_metrics


def generate_residual_matrix(
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    rng: Optional[np.random.Generator] = None
) -> np.ndarray:
    """
    Generate a residual mixing matrix.

    Args:
        n: Size of square matrix (number of streams)
        method: One of:
            - 'baseline': Identity matrix (no mixing)
            - 'hc': Random matrix with N(0, 1) entries
            - 'mhc': Random matrix projected to doubly stochastic via Sinkhorn
        sinkhorn_iters: Number of Sinkhorn iterations for mHC method
        rng: Random number generator for reproducibility

    Returns:
        Residual mixing matrix of shape (n, n)

    Example:
        >>> rng = np.random.default_rng(42)
        >>> M = generate_residual_matrix(4, 'mhc', sinkhorn_iters=20, rng=rng)
        >>> M.shape
        (4, 4)
    """
    if rng is None:
        rng = np.random.default_rng()

    if method == 'baseline':
        return np.eye(n)

    # Generate random matrix for HC and mHC
    M = rng.standard_normal((n, n))

    if method == 'hc':
        return M

    if method == 'mhc':
        # At k=0, return raw random matrix (same as HC) to show explosive behavior
        # At k>0, apply Sinkhorn projection to show transition to stability
        if sinkhorn_iters == 0:
            return M
        return sinkhorn_knopp(M, iterations=sinkhorn_iters)

    raise ValueError(f"Unknown method: {method}. Expected 'baseline', 'hc', or 'mhc'.")


def simulate_depth(
    depth: int,
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Simulate signal propagation through a deep residual network.

    This function generates `depth` residual matrices and computes both
    per-layer metrics and cumulative composite metrics at each depth.

    The composite mapping at layer l is:
        Composite(l) = H_l @ H_{l-1} @ ... @ H_1 @ H_0

    This represents the total transformation applied to signals from
    the input to layer l.

    Args:
        depth: Number of layers to simulate
        n: Matrix size (number of streams in multi-stream residual)
        method: Residual mixing strategy ('baseline', 'hc', or 'mhc')
        sinkhorn_iters: Number of Sinkhorn iterations for mHC
        seed: Random seed for reproducibility

    Returns:
        Dict containing:
        - 'method': str - the method used
        - 'depth': int - number of layers
        - 'n': int - matrix size
        - 'sinkhorn_iters': int - Sinkhorn iterations used
        - 'seed': int - random seed used
        - 'per_layer': list of dicts with metrics for each layer's matrix
        - 'composite': list of dicts with metrics for composite at each depth

    Example:
        >>> result = simulate_depth(64, 4, 'mhc', seed=42)
        >>> result['composite'][-1]['forward_gain'] < 5
        True
    """
    rng = np.random.default_rng(seed)

    per_layer = []
    composite_metrics = []

    composite = np.eye(n)  # Start with identity

    for layer_idx in range(depth):
        # Generate this layer's residual matrix
        H = generate_residual_matrix(n, method, sinkhorn_iters, rng)

        # Store per-layer metrics
        per_layer.append({
            'layer': layer_idx,
            **compute_all_metrics(H)
        })

        # Update composite: multiply from the left
        # Composite(l) = H_l @ Composite(l-1) = H_l @ H_{l-1} @ ... @ H_0
        composite = H @ composite

        # Store composite metrics at this depth
        composite_metrics.append({
            'upto_layer': layer_idx,
            **compute_all_metrics(composite)
        })

    return {
        'method': method,
        'depth': depth,
        'n': n,
        'sinkhorn_iters': sinkhorn_iters,
        'seed': seed,
        'per_layer': per_layer,
        'composite': composite_metrics,
    }


def run_comparison(
    depth: int = 64,
    n: int = 4,
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Run simulation for all three methods and return comparison.

    This is the main entry point for generating comparison data.
    It runs simulate_depth for baseline, HC, and mHC with the same
    parameters, making direct comparison possible.

    Args:
        depth: Number of layers to simulate
        n: Matrix size (number of streams)
        sinkhorn_iters: Number of Sinkhorn iterations for mHC
        seed: Random seed (same seed used for all methods for fair comparison)

    Returns:
        Dict with keys 'baseline', 'hc', 'mhc' containing simulation results

    Example:
        >>> results = run_comparison(depth=64, n=4, seed=42)
        >>> # Baseline should stay at 1
        >>> results['baseline']['composite'][-1]['forward_gain']
        1.0
        >>> # HC should explode
        >>> results['hc']['composite'][-1]['forward_gain'] > 10
        True
        >>> # mHC should stay bounded
        >>> results['mhc']['composite'][-1]['forward_gain'] < 5
        True
    """
    return {
        'baseline': simulate_depth(depth, n, 'baseline', sinkhorn_iters, seed),
        'hc': simulate_depth(depth, n, 'hc', sinkhorn_iters, seed),
        'mhc': simulate_depth(depth, n, 'mhc', sinkhorn_iters, seed),
    }


if __name__ == "__main__":
    # Quick demo when run directly
    print("Running mHC simulation comparison...")
    print("=" * 50)

    results = run_comparison(depth=64, n=4, seed=42)

    for method in ['baseline', 'hc', 'mhc']:
        final_composite = results[method]['composite'][-1]
        print(f"\n{method.upper()}:")
        print(f"  Final composite forward_gain:  {final_composite['forward_gain']:.4f}")
        print(f"  Final composite backward_gain: {final_composite['backward_gain']:.4f}")
        print(f"  Final composite spectral_norm: {final_composite['spectral_norm']:.4f}")