Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- mhc/__init__.py +27 -0
- mhc/hyper_connections.py +309 -0
- mhc/metrics.py +160 -0
- mhc/simulation.py +211 -0
- mhc/sinkhorn.py +148 -0
- mhc/torch_module.py +280 -0
- requirements.txt +3 -5
mhc/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
mHC (Manifold-Constrained Hyper-Connections) visualization library.

This package provides tools for demonstrating the stability properties
of mHC residual connections compared to unconstrained HC and baseline methods.

Modules:
- sinkhorn: Sinkhorn-Knopp projection onto doubly stochastic matrices
- metrics: Stability metrics (forward_gain, backward_gain, spectral_norm)
- simulation: Deep network signal propagation simulation
- torch_module: PyTorch implementation for use in neural networks

Author: Subhadip Mitra <contact@subhadipmitra.com>
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
"""

from .sinkhorn import sinkhorn_knopp, is_doubly_stochastic, projection_error
from .metrics import forward_gain, backward_gain, spectral_norm, compute_all_metrics
from .simulation import generate_residual_matrix, simulate_depth, run_comparison

# PyTorch modules (optional import - requires torch).
# The rest of the package stays importable in a numpy-only environment.
try:
    from .torch_module import SinkhornKnopp, mHCResidual, mHCBlock, create_mhc_mlp
except ImportError:
    pass  # torch not installed

__version__ = "0.1.0"
mhc/hyper_connections.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PyTorch implementation of Hyper-Connections (HC) and mHC.
|
| 3 |
+
|
| 4 |
+
HC extends residual connections with multiple parallel streams and learned mixing.
|
| 5 |
+
mHC constrains the mixing matrix to be doubly stochastic via Sinkhorn-Knopp.
|
| 6 |
+
|
| 7 |
+
Reference: https://arxiv.org/abs/2512.24880
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn as nn
|
| 12 |
+
|
| 13 |
+
|
def sinkhorn_knopp_torch(M: torch.Tensor, iters: int = 20, eps: float = 1e-8) -> torch.Tensor:
    """
    Differentiable Sinkhorn-Knopp projection onto the doubly stochastic set.

    The input is exponentiated (shifted by its max for numerical safety) and
    then alternately row- and column-normalized.

    Args:
        M: Square input matrix of shape (n, n).
        iters: Number of alternating normalization passes.
        eps: Stabilizer added to each normalizing sum to avoid division by zero.

    Returns:
        A matrix whose rows and columns each sum to (approximately) 1.
    """
    # Subtracting the max keeps exp() from overflowing; the shift cancels
    # under normalization, so the fixed point is unchanged.
    P = (M - M.max()).exp()
    for _ in range(iters):
        row_totals = P.sum(dim=-1, keepdim=True)
        P = P / (row_totals + eps)
        col_totals = P.sum(dim=-2, keepdim=True)
        P = P / (col_totals + eps)
    return P
| 31 |
+
|
| 32 |
+
|
class HyperConnections(nn.Module):
    """
    Hyper-Connections: multi-stream residual with a learned mixing matrix.

    Each layer carries N parallel streams; a learned N x N matrix H mixes
    the streams before the layer's own contribution is added back in:

        output = H @ input_streams + layer_contribution

    Args:
        n_streams: Number of parallel streams (N).
        init_scale: Scale applied to the random initialization of H.

    Shape:
        - Input x: (B, N, D) where B=batch, N=streams, D=features
        - Output: (B, N, D)
    """

    def __init__(self, n_streams: int = 4, init_scale: float = 0.1):
        super().__init__()
        self.n_streams = n_streams
        # Unconstrained raw mixing weights; subclasses may project them.
        self.H_res = nn.Parameter(torch.randn(n_streams, n_streams) * init_scale)

    def get_mixing_matrix(self) -> torch.Tensor:
        """Return the mixing matrix (unconstrained here; subclasses override)."""
        return self.H_res

    def forward(self, x: torch.Tensor, layer_output: torch.Tensor) -> torch.Tensor:
        """
        Mix the input streams and add the layer contribution.

        Args:
            x: Streamed input of shape (B, N, ...).
            layer_output: Output of the wrapped layer F, shape (B, N, ...).

        Returns:
            H @ x + layer_output
        """
        mix = self.get_mixing_matrix()
        # einsum accommodates any number of trailing feature dimensions.
        combined = torch.einsum('ij,bj...->bi...', mix, x)
        return combined + layer_output
| 75 |
+
|
| 76 |
+
|
class MHC(HyperConnections):
    """
    Manifold-Constrained Hyper-Connections (mHC).

    Identical to HC except that the mixing matrix is projected onto the
    doubly stochastic manifold with Sinkhorn-Knopp before use. This gives:
    - all eigenvalues bounded by 1,
    - stable signal propagation through depth,
    - no gradient explosion.

    Args:
        n_streams: Number of parallel streams.
        sinkhorn_iters: Sinkhorn iterations used for the projection.
        init_scale: Scale of the random initialization.
    """

    def __init__(self, n_streams: int = 4, sinkhorn_iters: int = 20, init_scale: float = 0.1):
        super().__init__(n_streams, init_scale)
        self.sinkhorn_iters = sinkhorn_iters

    def get_mixing_matrix(self) -> torch.Tensor:
        """Project the raw weights onto the doubly stochastic manifold."""
        return sinkhorn_knopp_torch(self.H_res, self.sinkhorn_iters)
| 100 |
+
|
| 101 |
+
|
class ResidualBlock(nn.Module):
    """
    Residual block whose skip connection is configurable.

    Modes:
    - 'baseline': plain residual connection, relu(x + F(x))
    - 'hc': Hyper-Connections with an unconstrained mixing matrix
    - 'mhc': Manifold-Constrained HC with doubly stochastic mixing

    Args:
        channels: Channel count of the two conv layers.
        method: One of 'baseline', 'hc', 'mhc'.
        n_streams: Stream count for HC/mHC.
        sinkhorn_iters: Sinkhorn iterations for mHC.
    """

    def __init__(
        self,
        channels: int,
        method: str = 'baseline',
        n_streams: int = 4,
        sinkhorn_iters: int = 20
    ):
        super().__init__()
        self.method = method
        self.n_streams = n_streams

        # Standard two-conv ResNet-style transform F(x).
        self.conv = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
        )

        # Optional stream-mixing module for the HC/mHC variants.
        if method == 'hc':
            self.hc = HyperConnections(n_streams)
        elif method == 'mhc':
            self.hc = MHC(n_streams, sinkhorn_iters)
        else:
            self.hc = None

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the block.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Output tensor of shape (B, C, H, W).
        """
        if self.method == 'baseline':
            # Plain residual connection.
            return self.relu(x + self.conv(x))

        batch, chans, height, width = x.shape
        streams = self.n_streams

        # Replicate the flattened input across the stream axis:
        # (B, C, H, W) -> (B, N, C*H*W). expand() is a view, so no copy.
        x_streams = x.view(batch, 1, -1).expand(batch, streams, -1)

        # The conv path runs once on the original input and is likewise
        # broadcast over the streams.
        f_out = self.conv(x)
        f_streams = f_out.view(batch, 1, -1).expand(batch, streams, -1)

        # H @ x_streams + f_streams via the HC/mHC module.
        mixed = self.hc(x_streams, f_streams)

        # Average the streams back down and restore the spatial layout.
        merged = mixed.mean(dim=1).view(batch, chans, height, width)
        return self.relu(merged)
| 180 |
+
|
| 181 |
+
|
class SimpleCNN(nn.Module):
    """
    Small CNN whose residual blocks use a configurable connection type.

    Architecture:
    - Stem: 3x3 conv lifting the input to `channels` feature maps
    - n_blocks residual blocks (connection type per `method`)
    - Head: global average pool + linear classifier

    Args:
        n_blocks: Number of residual blocks.
        channels: Hidden channel width.
        method: Residual type ('baseline', 'hc', 'mhc').
        n_streams: Stream count for HC/mHC.
        sinkhorn_iters: Sinkhorn iterations for mHC.
        num_classes: Number of output classes.
        in_channels: Number of input channels (3 for RGB).
    """

    def __init__(
        self,
        n_blocks: int = 8,
        channels: int = 64,
        method: str = 'baseline',
        n_streams: int = 4,
        sinkhorn_iters: int = 20,
        num_classes: int = 10,
        in_channels: int = 3
    ):
        super().__init__()
        self.method = method

        # Input stem.
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
        )

        # Stack of identically configured residual blocks.
        self.blocks = nn.ModuleList([
            ResidualBlock(channels, method, n_streams, sinkhorn_iters)
            for _ in range(n_blocks)
        ])

        # Pool + linear classification head.
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(channels, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute class logits.

        Args:
            x: Input images of shape (B, C, H, W).

        Returns:
            Logits of shape (B, num_classes).
        """
        out = self.stem(x)
        for blk in self.blocks:
            out = blk(out)
        return self.head(out)
| 248 |
+
|
| 249 |
+
|
def train_with_gradient_tracking(
    model: nn.Module,
    train_loader,
    epochs: int,
    device: torch.device,
    lr: float = 1e-3
) -> dict:
    """
    Train `model` while logging per-step loss, gradient norm, and accuracy.

    Args:
        model: Model to train (updated in place).
        train_loader: Iterable of (data, target) batches.
        epochs: Number of passes over `train_loader`.
        device: Device the batches are moved to.
        lr: Adam learning rate.

    Returns:
        Dict of per-step lists:
        - 'loss': scalar loss values
        - 'grad_norms': total L2 gradient norm across all parameters
        - 'accuracy': batch accuracies
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    history = {'loss': [], 'grad_norms': [], 'accuracy': []}

    model.train()
    for _ in range(epochs):
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()

            # Total gradient norm: sqrt of the summed squared per-parameter norms,
            # measured before the optimizer step so it reflects this batch.
            squared = sum(
                p.grad.norm().item() ** 2
                for p in model.parameters()
                if p.grad is not None
            )
            grad_norm = squared ** 0.5

            # Batch accuracy from the argmax prediction.
            batch_acc = (output.argmax(dim=1) == target).float().mean().item()

            history['loss'].append(loss.item())
            history['grad_norms'].append(grad_norm)
            history['accuracy'].append(batch_acc)

            optimizer.step()

    return history
mhc/metrics.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stability metrics for analyzing residual mixing matrices.
|
| 3 |
+
|
| 4 |
+
These metrics quantify how a matrix amplifies signals during forward/backward
|
| 5 |
+
propagation through a neural network layer.
|
| 6 |
+
|
| 7 |
+
Key insight from the mHC paper:
|
| 8 |
+
- Unconstrained matrices (HC) can have unbounded gains, causing signal explosion
|
| 9 |
+
- Doubly stochastic matrices (mHC) have all gains bounded by 1, ensuring stability
|
| 10 |
+
|
| 11 |
+
Metrics:
|
| 12 |
+
- forward_gain: Worst-case signal amplification in forward pass (max row sum)
|
| 13 |
+
- backward_gain: Worst-case gradient amplification in backward pass (max column sum)
|
| 14 |
+
- spectral_norm: Largest singular value (general operator norm)
|
| 15 |
+
|
| 16 |
+
For doubly stochastic matrices, all three equal exactly 1.
|
| 17 |
+
|
| 18 |
+
Author: Subhadip Mitra <contact@subhadipmitra.com>
|
| 19 |
+
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
|
def forward_gain(matrix: np.ndarray) -> float:
    """
    Maximum absolute row sum: worst-case forward signal amplification.

    This is max_i |sum_j M[i, j]|. For a doubly stochastic matrix every row
    sums to 1, so forward_gain = 1; for unconstrained matrices it can be
    arbitrarily large.

    Args:
        matrix: Square matrix of shape (n, n).

    Returns:
        The maximum absolute row sum as a Python float.

    Example:
        >>> forward_gain(np.eye(4))
        1.0
        >>> forward_gain(np.ones((4, 4)))
        4.0
    """
    row_totals = np.abs(matrix.sum(axis=1))
    return float(row_totals.max())
| 49 |
+
|
| 50 |
+
|
def backward_gain(matrix: np.ndarray) -> float:
    """
    Maximum absolute column sum: worst-case gradient amplification.

    This is max_j |sum_i M[i, j]|. Gradients flow through M^T in the
    backward pass, so column sums of M bound their growth. For a doubly
    stochastic matrix every column sums to 1, so backward_gain = 1; for
    unconstrained matrices it can be arbitrarily large.

    Args:
        matrix: Square matrix of shape (n, n).

    Returns:
        The maximum absolute column sum as a Python float.

    Example:
        >>> backward_gain(np.eye(4))
        1.0
        >>> backward_gain(np.ones((4, 4)))
        4.0
    """
    col_totals = np.abs(matrix.sum(axis=0))
    return float(col_totals.max())
| 75 |
+
|
| 76 |
+
|
def spectral_norm(matrix: np.ndarray, iterations: int = 20) -> float:
    """
    Estimate the spectral norm (largest singular value) via power iteration.

    The spectral norm ||M||_2 bounds L2 amplification: for any unit vector x,
    ||Mx||_2 <= ||M||_2. For doubly stochastic matrices it is <= 1.

    Implementation note: power iteration is run on the Gram matrix
    A = M^T @ M, whose dominant eigenvector is the top *right singular
    vector* of M. Iterating on M directly converges to the largest
    |eigenvalue| instead, which underestimates the spectral norm for
    non-normal matrices (e.g. [[0, 1], [0, 0]] has all eigenvalues 0 but
    spectral norm 1).

    Args:
        matrix: Square matrix of shape (n, n).
        iterations: Number of power iterations (20 is usually sufficient).

    Returns:
        Estimated spectral norm (largest singular value).

    Example:
        >>> spectral_norm(np.eye(4))  # doctest: +ELLIPSIS
        1.0...
        >>> spectral_norm(2 * np.eye(4))  # doctest: +ELLIPSIS
        2.0...
    """
    n = matrix.shape[0]

    # Gram matrix: its top eigenvalue is sigma_max(M)^2.
    gram = matrix.T @ matrix

    # Fixed seed keeps the estimate reproducible across calls.
    rng = np.random.default_rng(42)
    v = rng.standard_normal(n)
    v = v / np.linalg.norm(v)

    for _ in range(iterations):
        w = gram @ v
        norm = np.linalg.norm(w)
        if norm < 1e-10:
            # Numerically zero image: the matrix is (effectively) zero.
            return 0.0
        v = w / norm

    # v approximates the top right singular vector, so ||M @ v|| ~= sigma_max.
    return float(np.linalg.norm(matrix @ v))
| 122 |
+
|
| 123 |
+
|
def compute_all_metrics(matrix: np.ndarray) -> dict:
    """
    Compute every stability metric for a residual mixing matrix.

    This is the main entry point for analyzing a mixing matrix; the result
    contains everything needed to assess training stability.

    Args:
        matrix: Square matrix of shape (n, n).

    Returns:
        Dict containing:
        - spectral_norm: largest singular value
        - forward_gain: max absolute row sum
        - backward_gain: max absolute column sum
        - row_sum_max_dev: max |row sum - 1|
        - col_sum_max_dev: max |col sum - 1|
        - min_entry: smallest matrix entry

    Example:
        >>> metrics = compute_all_metrics(np.eye(4))
        >>> metrics['forward_gain']
        1.0
        >>> metrics['backward_gain']
        1.0
    """
    sums_by_row = matrix.sum(axis=1)
    sums_by_col = matrix.sum(axis=0)

    metrics = {
        'spectral_norm': spectral_norm(matrix),
        'forward_gain': float(np.abs(sums_by_row).max()),
        'backward_gain': float(np.abs(sums_by_col).max()),
        'row_sum_max_dev': float(np.abs(sums_by_row - 1).max()),
        'col_sum_max_dev': float(np.abs(sums_by_col - 1).max()),
        'min_entry': float(matrix.min()),
    }
    return metrics
mhc/simulation.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simulation engine for deep network signal propagation.
|
| 3 |
+
|
| 4 |
+
This module simulates how signals propagate through deep residual networks
|
| 5 |
+
with different residual mixing strategies:
|
| 6 |
+
|
| 7 |
+
- baseline: Identity matrices (no mixing, standard residual connections)
|
| 8 |
+
- hc: Random unconstrained matrices (Hyper-Connections)
|
| 9 |
+
- mhc: Sinkhorn-projected doubly stochastic matrices (Manifold-Constrained HC)
|
| 10 |
+
|
| 11 |
+
Key insight from the mHC paper:
|
| 12 |
+
The COMPOSITE mapping (product of all layer matrices H_L @ H_{L-1} @ ... @ H_0)
|
| 13 |
+
is what matters for signal propagation:
|
| 14 |
+
- For HC: composite gains explode exponentially (3000x+ at depth 64)
|
| 15 |
+
- For mHC: composite gains stay bounded (~1.6x at depth 64)
|
| 16 |
+
|
| 17 |
+
This happens because doubly stochastic matrices are closed under multiplication.
|
| 18 |
+
|
| 19 |
+
Author: Subhadip Mitra <contact@subhadipmitra.com>
|
| 20 |
+
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import numpy as np
|
| 24 |
+
from typing import Dict, Literal, Optional
|
| 25 |
+
|
| 26 |
+
from .sinkhorn import sinkhorn_knopp
|
| 27 |
+
from .metrics import compute_all_metrics
|
| 28 |
+
|
| 29 |
+
|
def generate_residual_matrix(
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    rng: Optional[np.random.Generator] = None
) -> np.ndarray:
    """
    Build one residual mixing matrix for the given strategy.

    Args:
        n: Matrix size (number of streams).
        method: One of:
            - 'baseline': identity matrix (no mixing)
            - 'hc': raw N(0, 1) random matrix
            - 'mhc': N(0, 1) matrix projected to doubly stochastic
        sinkhorn_iters: Sinkhorn iterations for 'mhc'. With 0 iterations the
            raw matrix is returned (identical to 'hc'), which demonstrates
            the transition from explosive to stable behavior as k grows.
        rng: Optional generator for reproducibility.

    Returns:
        Mixing matrix of shape (n, n).

    Raises:
        ValueError: If `method` is not one of the three recognized names.

    Example:
        >>> rng = np.random.default_rng(42)
        >>> M = generate_residual_matrix(4, 'mhc', sinkhorn_iters=20, rng=rng)
        >>> M.shape
        (4, 4)
    """
    if rng is None:
        rng = np.random.default_rng()

    if method == 'baseline':
        return np.eye(n)

    # 'hc' and 'mhc' both start from the same Gaussian draw.
    raw = rng.standard_normal((n, n))

    if method == 'hc':
        return raw

    if method == 'mhc':
        # k = 0 deliberately skips the projection so the starting point
        # matches raw HC behavior.
        if sinkhorn_iters == 0:
            return raw
        return sinkhorn_knopp(raw, iterations=sinkhorn_iters)

    raise ValueError(f"Unknown method: {method}. Expected 'baseline', 'hc', or 'mhc'.")
| 77 |
+
|
| 78 |
+
|
def simulate_depth(
    depth: int,
    n: int,
    method: Literal['baseline', 'hc', 'mhc'],
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Simulate signal propagation through a deep residual network.

    Draws one mixing matrix per layer and maintains the running product

        Composite(l) = H_l @ H_{l-1} @ ... @ H_0,

    i.e. the total transformation applied to a signal between the input
    and layer l. Metrics are recorded for each layer matrix and for the
    composite at every depth.

    Args:
        depth: Number of layers to simulate.
        n: Matrix size (number of streams in the multi-stream residual).
        method: Residual mixing strategy ('baseline', 'hc', or 'mhc').
        sinkhorn_iters: Number of Sinkhorn iterations for mHC.
        seed: Random seed for reproducibility.

    Returns:
        Dict with the run configuration ('method', 'depth', 'n',
        'sinkhorn_iters', 'seed') plus:
        - 'per_layer': list of metric dicts, one per layer matrix
        - 'composite': list of metric dicts for the composite at each depth

    Example:
        >>> result = simulate_depth(64, 4, 'mhc', seed=42)
        >>> result['composite'][-1]['forward_gain'] < 5
        True
    """
    rng = np.random.default_rng(seed)

    layer_records = []
    composite_records = []

    composite = np.eye(n)  # Depth-0 product is the identity.

    for idx in range(depth):
        # Draw this layer's residual mixing matrix.
        H = generate_residual_matrix(n, method, sinkhorn_iters, rng)

        layer_records.append({
            'layer': idx,
            **compute_all_metrics(H)
        })

        # Left-multiply so the newest layer is applied last:
        # Composite(l) = H_l @ Composite(l-1).
        composite = H @ composite

        composite_records.append({
            'upto_layer': idx,
            **compute_all_metrics(composite)
        })

    return {
        'method': method,
        'depth': depth,
        'n': n,
        'sinkhorn_iters': sinkhorn_iters,
        'seed': seed,
        'per_layer': layer_records,
        'composite': composite_records,
    }
| 156 |
+
|
| 157 |
+
|
def run_comparison(
    depth: int = 64,
    n: int = 4,
    sinkhorn_iters: int = 20,
    seed: int = 42
) -> Dict:
    """
    Run simulate_depth for baseline, HC, and mHC and return all results.

    This is the main entry point for generating comparison data. Every
    method is run with identical parameters (including the seed), so the
    three result sets are directly comparable.

    Args:
        depth: Number of layers to simulate.
        n: Matrix size (number of streams).
        sinkhorn_iters: Number of Sinkhorn iterations for mHC.
        seed: Shared random seed for a fair comparison.

    Returns:
        Dict with keys 'baseline', 'hc', 'mhc' containing simulation results.

    Example:
        >>> results = run_comparison(depth=64, n=4, seed=42)
        >>> # Baseline should stay at 1
        >>> results['baseline']['composite'][-1]['forward_gain']
        1.0
        >>> # HC should explode
        >>> results['hc']['composite'][-1]['forward_gain'] > 10
        True
        >>> # mHC should stay bounded
        >>> results['mhc']['composite'][-1]['forward_gain'] < 5
        True
    """
    methods = ('baseline', 'hc', 'mhc')
    return {
        name: simulate_depth(depth, n, name, sinkhorn_iters, seed)
        for name in methods
    }
| 197 |
+
|
| 198 |
+
|
if __name__ == "__main__":
    # Quick demo when run directly: compare composite gains for the three
    # residual strategies at depth 64 with 4 streams.
    print("Running mHC simulation comparison...")
    print("=" * 50)

    results = run_comparison(depth=64, n=4, seed=42)

    for method in ['baseline', 'hc', 'mhc']:
        # Metrics of the full-depth composite mapping for this method.
        final_composite = results[method]['composite'][-1]
        print(f"\n{method.upper()}:")
        print(f"  Final composite forward_gain: {final_composite['forward_gain']:.4f}")
        print(f"  Final composite backward_gain: {final_composite['backward_gain']:.4f}")
        print(f"  Final composite spectral_norm: {final_composite['spectral_norm']:.4f}")
mhc/sinkhorn.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sinkhorn-Knopp algorithm for projecting matrices onto doubly stochastic matrices.
|
| 3 |
+
|
| 4 |
+
A doubly stochastic matrix has:
|
| 5 |
+
- All entries >= 0
|
| 6 |
+
- All rows sum to 1
|
| 7 |
+
- All columns sum to 1
|
| 8 |
+
|
| 9 |
+
The Sinkhorn-Knopp algorithm projects any matrix onto this set by:
|
| 10 |
+
1. Exponentiating the matrix to make all entries positive
|
| 11 |
+
2. Alternating row and column normalization until convergence
|
| 12 |
+
|
| 13 |
+
Mathematical background:
|
| 14 |
+
The set of doubly stochastic matrices forms the Birkhoff polytope. Sinkhorn-Knopp
|
| 15 |
+
finds the unique doubly stochastic matrix of the form D1 * exp(M) * D2 where
|
| 16 |
+
D1 and D2 are diagonal matrices with positive entries.
|
| 17 |
+
|
| 18 |
+
Key property for mHC: The product of doubly stochastic matrices is also
|
| 19 |
+
doubly stochastic (closure under multiplication), which bounds composite gains.
|
| 20 |
+
|
| 21 |
+
Author: Subhadip Mitra <contact@subhadipmitra.com>
|
| 22 |
+
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import numpy as np
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def sinkhorn_knopp(matrix: np.ndarray, iterations: int = 20, eps: float = 1e-8) -> np.ndarray:
    """
    Project ``matrix`` onto the Birkhoff polytope of doubly stochastic matrices.

    The input is exponentiated (after shifting by its maximum so ``exp``
    cannot overflow), then rows and columns are alternately rescaled to sum
    to 1. Sinkhorn-Knopp converges to the unique doubly stochastic matrix of
    the form D1 @ exp(matrix) @ D2 with positive diagonal D1, D2.

    Args:
        matrix: Real-valued square matrix of shape (n, n).
        iterations: Number of row/column normalization sweeps. 20 is
            typically sufficient for ~1e-3 accuracy.
        eps: Floor applied to the row/column sums to avoid division by zero.

    Returns:
        Matrix of shape (n, n) with non-negative entries whose rows and
        columns each sum to approximately 1.

    Example:
        >>> M = np.random.randn(4, 4)
        >>> P = sinkhorn_knopp(M, iterations=20)
        >>> np.allclose(P.sum(axis=1), 1, atol=1e-3)
        True
        >>> np.allclose(P.sum(axis=0), 1, atol=1e-3)
        True
    """
    # Shift by the maximum entry before exponentiating; this guards against
    # overflow and cancels out in the first row normalization.
    P = np.exp(matrix - matrix.max())

    # Alternate normalization: axis=1 rescales rows, axis=0 rescales columns.
    for _ in range(iterations):
        for axis in (1, 0):
            sums = P.sum(axis=axis, keepdims=True)
            P = P / np.maximum(sums, eps)

    return P
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def is_doubly_stochastic(matrix: np.ndarray, tol: float = 1e-3) -> bool:
    """
    Test whether ``matrix`` is approximately doubly stochastic.

    A doubly stochastic matrix has non-negative entries and every row and
    every column summing to 1; each condition is checked up to ``tol``.

    Args:
        matrix: Square matrix to test, shape (n, n).
        tol: Allowed deviation for the entry sign and the row/column sums.

    Returns:
        True if all three conditions hold within tolerance, else False.

    Example:
        >>> is_doubly_stochastic(np.eye(4))
        True
        >>> is_doubly_stochastic(np.random.randn(4, 4))
        False
    """
    # Reject any clearly negative entry first.
    if matrix.min() < -tol:
        return False

    rows_ok = np.allclose(matrix.sum(axis=1), 1.0, atol=tol)
    cols_ok = np.allclose(matrix.sum(axis=0), 1.0, atol=tol)
    return bool(rows_ok and cols_ok)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def projection_error(matrix: np.ndarray) -> dict:
    """
    Measure how far ``matrix`` is from being doubly stochastic.

    Useful for verifying Sinkhorn-Knopp convergence, debugging numerical
    issues, and visualizing the projection process.

    Args:
        matrix: Matrix to analyze, shape (n, n).

    Returns:
        Dict with:
        - 'row_sum_max_dev': largest |row sum - 1| over all rows
        - 'col_sum_max_dev': largest |column sum - 1| over all columns
        - 'min_entry': smallest entry (>= 0 for a doubly stochastic matrix)

    Example:
        >>> P = sinkhorn_knopp(np.random.randn(4, 4), iterations=20)
        >>> projection_error(P)['row_sum_max_dev'] < 1e-3
        True
    """
    row_dev = np.abs(matrix.sum(axis=1) - 1.0).max()
    col_dev = np.abs(matrix.sum(axis=0) - 1.0).max()

    return {
        'row_sum_max_dev': float(row_dev),
        'col_sum_max_dev': float(col_dev),
        'min_entry': float(matrix.min()),
    }
|
mhc/torch_module.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PyTorch implementation of mHC (Manifold-Constrained Hyper-Connections).
|
| 3 |
+
|
| 4 |
+
This module provides differentiable implementations that can be used
|
| 5 |
+
directly in neural network training:
|
| 6 |
+
|
| 7 |
+
- SinkhornKnopp: Differentiable projection onto doubly stochastic matrices
|
| 8 |
+
- mHCResidual: Complete mHC residual connection module
|
| 9 |
+
- mHCBlock: Wrapper to add mHC residuals to any layer
|
| 10 |
+
|
| 11 |
+
Usage:
|
| 12 |
+
# Wrap any layer with mHC residuals
|
| 13 |
+
layer = nn.Linear(256, 256)
|
| 14 |
+
mhc_layer = mHCBlock(layer, dim=256, n_streams=4)
|
| 15 |
+
|
| 16 |
+
# Forward pass
|
| 17 |
+
x = torch.randn(32, 4, 256) # (batch, n_streams, dim)
|
| 18 |
+
output = mhc_layer(x)
|
| 19 |
+
|
| 20 |
+
Author: Subhadip Mitra <contact@subhadipmitra.com>
|
| 21 |
+
Based on DeepSeek's mHC paper: https://arxiv.org/abs/2512.24880
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import torch
|
| 25 |
+
import torch.nn as nn
|
| 26 |
+
from typing import Optional
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class SinkhornKnopp(nn.Module):
    """
    Differentiable Sinkhorn-Knopp projection onto doubly stochastic matrices.

    Projects any matrix onto the Birkhoff polytope (set of doubly stochastic
    matrices) using alternating row and column normalization. Supports
    batched input of shape (..., n, n); each matrix is projected
    independently.

    Args:
        iterations: Number of normalization iterations (default: 20)
        eps: Small value for numerical stability (default: 1e-8)

    Example:
        >>> sinkhorn = SinkhornKnopp(iterations=20)
        >>> M = torch.randn(4, 4)
        >>> P = sinkhorn(M)
        >>> P.sum(dim=1)  # Should be close to 1
        tensor([1., 1., 1., 1.])
    """

    def __init__(self, iterations: int = 20, eps: float = 1e-8):
        super().__init__()
        self.iterations = iterations
        self.eps = eps

    def forward(self, matrix: torch.Tensor) -> torch.Tensor:
        """
        Project matrix onto doubly stochastic matrices.

        Args:
            matrix: Input tensor of shape (..., n, n)

        Returns:
            Approximately doubly stochastic matrix of same shape
        """
        # Subtract the per-matrix max (NOT the global max) before exp.
        # With a global max, a batch element far below the max underflows
        # in exp(), and the eps added to the denominators then dominates
        # its tiny sums, destroying the projection for that element.
        # The per-matrix constant shift cancels in the first row
        # normalization, so the result is unchanged up to floating point.
        shift = matrix.amax(dim=(-2, -1), keepdim=True)
        P = torch.exp(matrix - shift)

        for _ in range(self.iterations):
            # Row normalization
            P = P / (P.sum(dim=-1, keepdim=True) + self.eps)
            # Column normalization
            P = P / (P.sum(dim=-2, keepdim=True) + self.eps)

        return P
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class mHCResidual(nn.Module):
    """
    Manifold-Constrained Hyper-Connection (mHC) residual module.

    Maintains ``n_streams`` parallel copies of the hidden state and mixes
    them with a learnable matrix that is projected onto the set of doubly
    stochastic matrices via Sinkhorn-Knopp on every forward pass. Doubly
    stochastic matrices are closed under multiplication, which keeps the
    composite stream-mixing map bounded with depth.

    Args:
        dim: Hidden dimension size
        n_streams: Number of parallel streams (default: 4)
        sinkhorn_iters: Number of Sinkhorn iterations (default: 20)

    Example:
        >>> mhc = mHCResidual(dim=256, n_streams=4)
        >>> x = torch.randn(32, 4, 256)  # (batch, n_streams, dim)
        >>> layer_out = torch.randn(32, 256)  # Output from layer F
        >>> mhc(x, layer_out).shape
        torch.Size([32, 4, 256])
    """

    def __init__(
        self,
        dim: int,
        n_streams: int = 4,
        sinkhorn_iters: int = 20
    ):
        super().__init__()
        self.dim = dim
        self.n_streams = n_streams

        # Projection onto doubly stochastic matrices.
        self.sinkhorn = SinkhornKnopp(iterations=sinkhorn_iters)

        # H_res: learnable stream-mixing logits; the Sinkhorn projection is
        # applied in forward(), so these are unconstrained here.
        self.H_res = nn.Parameter(torch.randn(n_streams, n_streams) * 0.01)

        # H_pre: weights aggregating the streams into the layer input
        # (1 x n_streams).
        self.H_pre = nn.Parameter(torch.ones(1, n_streams) / n_streams)

        # H_post: weights distributing the layer output back to the streams
        # (n_streams x 1).
        self.H_post = nn.Parameter(torch.ones(n_streams, 1) / n_streams)

        # Gating scalars, initialized near zero so each block starts close
        # to the identity map (stable early training).
        self.alpha_res = nn.Parameter(torch.tensor(0.01))
        self.alpha_pre = nn.Parameter(torch.tensor(0.01))
        self.alpha_post = nn.Parameter(torch.tensor(0.01))

        # Per-stream bias terms.
        self.bias_res = nn.Parameter(torch.zeros(n_streams, dim))
        self.bias_post = nn.Parameter(torch.zeros(n_streams, dim))

    def forward(
        self,
        x: torch.Tensor,
        layer_output: torch.Tensor
    ) -> torch.Tensor:
        """
        Apply the mHC residual connection.

        Args:
            x: Input hidden state, shape (batch, n_streams, dim)
            layer_output: Output from layer function F, shape (batch, dim)

        Returns:
            Updated hidden state, shape (batch, n_streams, dim)
        """
        # (Removed an unused `batch_size` local that the original computed.)

        # Constrain the stream-mixing matrix to be doubly stochastic.
        H_res_proj = self.sinkhorn(self.H_res)

        # Mix the streams: output stream r receives sum_s H[r, s] * x[:, s].
        x_mixed = torch.einsum('bsd,rs->brd', x, H_res_proj)
        x_mixed = self.alpha_res * x_mixed + self.bias_res

        # Broadcast the layer output into every stream, weighted by H_post:
        # (batch, 1, dim) * (n_streams, 1) -> (batch, n_streams, dim).
        layer_contrib = layer_output.unsqueeze(1) * self.H_post
        layer_contrib = self.alpha_post * layer_contrib + self.bias_post

        # Identity path + gated stream mixing + gated layer contribution.
        return x + x_mixed + layer_contrib

    def get_aggregated_input(self, x: torch.Tensor) -> torch.Tensor:
        """
        Aggregate the multi-stream input into a single layer input.

        Args:
            x: Hidden state, shape (batch, n_streams, dim)

        Returns:
            Aggregated input for the layer, shape (batch, dim)
        """
        # abs() keeps the aggregation weights non-negative; alpha_pre gates
        # the overall magnitude of the pooled input.
        aggregated = torch.einsum('bsd,os->bd', x, self.H_pre.abs())
        return self.alpha_pre * aggregated
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
class mHCBlock(nn.Module):
    """
    Wrap an arbitrary layer with mHC residual connections.

    This is the main user-facing interface: wrap any module that maps a
    (batch, dim) tensor to a (batch, dim) tensor (e.g. nn.Linear, an
    attention block) and the wrapper manages the multi-stream hidden state.

    Args:
        layer: The module to wrap (e.g. nn.Linear)
        dim: Hidden dimension
        n_streams: Number of parallel streams (default: 4)
        sinkhorn_iters: Number of Sinkhorn iterations (default: 20)

    Example:
        >>> block = mHCBlock(nn.Linear(256, 256), dim=256, n_streams=4)
        >>> x = torch.randn(32, 4, 256)  # (batch, n_streams, dim)
        >>> block(x).shape
        torch.Size([32, 4, 256])
    """

    def __init__(
        self,
        layer: nn.Module,
        dim: int,
        n_streams: int = 4,
        sinkhorn_iters: int = 20
    ):
        super().__init__()
        self.layer = layer
        self.mhc = mHCResidual(dim, n_streams, sinkhorn_iters)
        self.n_streams = n_streams

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the wrapped layer under an mHC residual.

        Args:
            x: Input tensor, shape (batch, n_streams, dim)

        Returns:
            Output tensor, shape (batch, n_streams, dim)
        """
        # Collapse the streams into a single (batch, dim) input, apply the
        # wrapped layer, then fold its output back into every stream via
        # the mHC residual update.
        pooled = self.mhc.get_aggregated_input(x)
        transformed = self.layer(pooled)
        return self.mhc(x, transformed)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def create_mhc_mlp(
    dim: int,
    n_layers: int,
    n_streams: int = 4,
    sinkhorn_iters: int = 20
) -> nn.Sequential:
    """
    Build an MLP in which every Linear layer carries mHC residuals.

    The width ``dim`` is constant throughout so the multi-stream hidden
    state stays compatible from block to block; a GELU nonlinearity is
    inserted between consecutive blocks (but not after the last one).

    Args:
        dim: Hidden dimension (constant throughout)
        n_layers: Number of mHC blocks
        n_streams: Number of mHC streams
        sinkhorn_iters: Sinkhorn iterations

    Returns:
        nn.Sequential expecting input of shape (batch, n_streams, dim)

    Example:
        >>> mlp = create_mhc_mlp(dim=256, n_layers=4)
        >>> x = torch.randn(32, 4, 256)  # (batch, n_streams, dim)
        >>> mlp(x).shape
        torch.Size([32, 4, 256])
    """
    modules = []
    for idx in range(n_layers):
        modules.append(mHCBlock(nn.Linear(dim, dim), dim, n_streams, sinkhorn_iters))
        if idx != n_layers - 1:  # no activation after the final block
            modules.append(nn.GELU())
    return nn.Sequential(*modules)
|
requirements.txt
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
|
| 2 |
torch
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
# Add other dependencies as needed
|
|
|
|
| 1 |
+
numpy
|
| 2 |
torch
|
| 3 |
+
matplotlib
|
| 4 |
+
pytest
|
|
|
|
|
|