🚀 Refined BitTransformerLM: Organized codebase with best practices
bit_transformer/BTLM_Extensions/muon_optimizer.py
ADDED
@@ -0,0 +1,332 @@
"""
Muon Optimizer for BitTransformerLM Extensions
==============================================

Implementation of the Muon optimizer with orthogonal momentum updates.
Based on "Muon: MomentUm Orthogonalized by Newton-Schulz".

Key features:
- Orthogonal momentum updates
- Better convergence properties than Adam/AdamW in many settings
- Memory-efficient implementation
- Compatible with BitTransformerLM's training infrastructure
"""

import warnings
from typing import Any, Dict, Optional, Tuple

import torch
from torch.optim.optimizer import Optimizer


class Muon(Optimizer):
    """
    Muon optimizer with orthogonal momentum updates.

    This implementation orthogonalizes the momentum buffer with a Newton-Schulz
    iteration, which tends to give more stable training dynamics.

    Args:
        params: Iterable of parameters to optimize
        lr: Learning rate (default: 1e-3)
        momentum: Momentum factor (default: 0.95)
        nesterov: Enable Nesterov momentum (default: False)
        backend: Backend for orthogonalization ('newtonschulz' or 'svd')
        update_period: Orthogonalize the momentum buffer every this many steps (default: 1)
        rank_deficiency_threshold: Threshold for rank-deficiency detection
        eps: Small constant for numerical stability (default: 1e-8)
        weight_decay: Weight decay coefficient (default: 0.0)
    """

    def __init__(
        self,
        params,
        lr: float = 1e-3,
        momentum: float = 0.95,
        nesterov: bool = False,
        backend: str = "newtonschulz",
        update_period: int = 1,
        rank_deficiency_threshold: float = 1e-6,
        eps: float = 1e-8,
        weight_decay: float = 0.0,
    ):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= momentum <= 1.0:
            raise ValueError(f"Invalid momentum value: {momentum}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if backend not in ("newtonschulz", "svd"):
            raise ValueError(f"Invalid backend: {backend}")
        if update_period < 1:
            raise ValueError(f"Invalid update_period: {update_period}")

        defaults = dict(
            lr=lr,
            momentum=momentum,
            nesterov=nesterov,
            backend=backend,
            update_period=update_period,
            rank_deficiency_threshold=rank_deficiency_threshold,
            eps=eps,
            weight_decay=weight_decay,
        )
        super().__init__(params, defaults)

    def _orthogonalize_newtonschulz(self, matrix: torch.Tensor, num_iterations: int = 5) -> torch.Tensor:
        """Orthogonalize a matrix using the Newton-Schulz iteration."""
        original_shape = matrix.shape
        # Collapse leading dimensions so we always work on a 2-D matrix.
        if matrix.dim() > 2:
            matrix = matrix.view(-1, matrix.shape[-1])

        # The cubic iteration X <- 1.5*X - 0.5*X*X^T*X only converges when the
        # spectral norm of X is below sqrt(3), so normalize first. The Frobenius
        # norm upper-bounds the spectral norm, which makes this scaling safe.
        X = matrix / (matrix.norm() + self.defaults["eps"])

        if X.shape[0] >= X.shape[1]:
            # Tall matrix: orthogonalize columns.
            for _ in range(num_iterations):
                A = X.T @ X
                X = X @ (1.5 * torch.eye(A.shape[0], device=A.device, dtype=A.dtype) - 0.5 * A)
        else:
            # Wide matrix: orthogonalize rows.
            for _ in range(num_iterations):
                A = X @ X.T
                X = (1.5 * torch.eye(A.shape[0], device=A.device, dtype=A.dtype) - 0.5 * A) @ X

        return X.view(original_shape)

    def _orthogonalize_svd(self, matrix: torch.Tensor) -> torch.Tensor:
        """Orthogonalize a matrix via SVD (exact but slower than Newton-Schulz)."""
        original_shape = matrix.shape
        if matrix.dim() > 2:
            matrix = matrix.view(-1, matrix.shape[-1])

        try:
            # U @ Vt is the nearest orthogonal matrix in the Frobenius sense
            # (the orthogonal factor of the polar decomposition).
            U, _, Vt = torch.linalg.svd(matrix, full_matrices=False)
            return (U @ Vt).view(original_shape)
        except torch.linalg.LinAlgError:
            # Fall back to Newton-Schulz if the SVD fails to converge.
            warnings.warn("SVD failed, falling back to Newton-Schulz")
            return self._orthogonalize_newtonschulz(matrix)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step."""
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                grad = p.grad
                if grad.dtype in {torch.float16, torch.bfloat16}:
                    grad = grad.float()

                state = self.state[p]

                # State initialization. Keep the buffer in the (possibly upcast)
                # gradient dtype so half-precision parameters accumulate in fp32.
                if len(state) == 0:
                    state["step"] = 0
                    state["momentum_buffer"] = torch.zeros_like(
                        grad, memory_format=torch.preserve_format
                    )

                momentum_buffer = state["momentum_buffer"]
                state["step"] += 1

                # Weight decay folded into the gradient.
                if group["weight_decay"] != 0:
                    grad = grad.add(p, alpha=group["weight_decay"])

                # Momentum accumulation.
                momentum_buffer.mul_(group["momentum"]).add_(grad)

                # Orthogonalize the momentum buffer every `update_period` steps.
                if state["step"] % group["update_period"] == 0 and momentum_buffer.numel() > 1:
                    # Only orthogonalize tensors with meaningful 2-D structure.
                    if momentum_buffer.dim() >= 2 and min(momentum_buffer.shape[-2:]) > 1:
                        if group["backend"] == "newtonschulz":
                            orthogonal_momentum = self._orthogonalize_newtonschulz(momentum_buffer)
                        else:
                            orthogonal_momentum = self._orthogonalize_svd(momentum_buffer)

                        # Guard against rank deficiency: if orthogonalization
                        # collapsed the buffer, keep the raw momentum instead.
                        rank_ratio = orthogonal_momentum.norm() / (momentum_buffer.norm() + group["eps"])
                        if rank_ratio < group["rank_deficiency_threshold"]:
                            warnings.warn("Detected rank deficiency in momentum buffer")
                        else:
                            momentum_buffer.copy_(orthogonal_momentum)

                # Nesterov acceleration looks ahead along the momentum direction.
                if group["nesterov"]:
                    update = grad.add(momentum_buffer, alpha=group["momentum"])
                else:
                    update = momentum_buffer

                # Apply the update, casting back to the parameter dtype if needed.
                p.add_(update.to(p.dtype), alpha=-group["lr"])

        return loss
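

# Illustrative sanity check (not part of the original commit): after Newton-Schulz
# orthogonalization the columns of a tall matrix should be near-orthonormal, i.e.
# X.T @ X should approximate the identity. A minimal sketch:
def _check_orthogonalization(rows: int = 64, cols: int = 16) -> float:
    """Max deviation of X.T @ X from the identity for an orthogonalized random matrix."""
    opt = Muon([torch.zeros(2, 2, requires_grad=True)])  # instance only to reach the helper
    X = opt._orthogonalize_newtonschulz(torch.randn(rows, cols), num_iterations=20)
    return (X.T @ X - torch.eye(cols)).abs().max().item()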


def configure_muon_optimizer(
    model: torch.nn.Module,
    lr: float = 1e-3,
    momentum: float = 0.95,
    weight_decay: float = 0.01,
    total_steps: Optional[int] = None,
    warmup_ratio: float = 0.1,
    nesterov: bool = False,
    backend: str = "newtonschulz",
    **muon_kwargs,
) -> Tuple[Muon, Optional[torch.optim.lr_scheduler._LRScheduler]]:
    """
    Configure a Muon optimizer with a OneCycle learning-rate schedule.

    This is a drop-in replacement for BitTransformerLM's configure_optimizer
    function, using Muon instead of AdamW.

    Args:
        model: PyTorch model to optimize
        lr: Peak learning rate
        momentum: Momentum factor for Muon
        weight_decay: Weight decay coefficient
        total_steps: Total training steps for the OneCycle schedule
        warmup_ratio: Fraction of steps spent warming up
        nesterov: Enable Nesterov momentum
        backend: Orthogonalization backend
        **muon_kwargs: Additional arguments forwarded to the Muon optimizer

    Returns:
        Tuple of (optimizer, scheduler); scheduler is None when total_steps is not given.
    """
    # Apply weight decay to matrix-shaped weights but not to biases/norm parameters.
    decay_params = []
    no_decay_params = []

    for param in model.parameters():
        if not param.requires_grad:
            continue
        if param.dim() >= 2:
            decay_params.append(param)
        else:
            no_decay_params.append(param)

    param_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    optimizer = Muon(
        param_groups,
        lr=lr,
        momentum=momentum,
        nesterov=nesterov,
        backend=backend,
        **muon_kwargs,
    )

    scheduler = None
    if total_steps is not None and total_steps > 0:
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=lr,
            total_steps=total_steps,
            pct_start=warmup_ratio,
            anneal_strategy="cos",
            cycle_momentum=False,  # Muon handles momentum internally
            div_factor=25.0,
            final_div_factor=1e4,
        )

    return optimizer, scheduler
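

# Minimal usage sketch (illustrative, not from the original commit): the returned
# (optimizer, scheduler) pair plugs into an ordinary PyTorch loop. `model`, `batches`,
# and `loss_fn` stand in for whatever the surrounding training code provides.
def _example_training_loop(model, batches, loss_fn, total_steps: int = 1000) -> None:
    optimizer, scheduler = configure_muon_optimizer(model, lr=1e-3, total_steps=total_steps)
    for inputs, targets in batches:
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()  # OneCycleLR advances once per optimizer step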


def create_muon_training_config(
    lr: float = 1e-3,
    momentum: float = 0.95,
    weight_decay: float = 0.01,
    backend: str = "newtonschulz",
    nesterov: bool = False,
    **kwargs,
) -> Dict[str, Any]:
    """
    Create a training-configuration dictionary for the Muon optimizer.

    The resulting config can be passed to BitTransformerLM's training scripts.

    Args:
        lr: Learning rate
        momentum: Momentum factor
        weight_decay: Weight decay coefficient
        backend: Orthogonalization backend
        nesterov: Enable Nesterov momentum
        **kwargs: Additional configuration options

    Returns:
        Dictionary containing the training configuration
    """
    return {
        "optimizer_type": "muon",
        "optimizer_config": {
            "lr": lr,
            "momentum": momentum,
            "weight_decay": weight_decay,
            "backend": backend,
            "nesterov": nesterov,
            **kwargs,
        },
        "scheduler_type": "onecycle",
    }
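

# Sketch of consuming the config above (illustrative; `_build_optimizer_from_config`
# is a hypothetical helper, not part of BitTransformerLM's public API):
def _build_optimizer_from_config(model: torch.nn.Module, config: Dict[str, Any]) -> Muon:
    assert config["optimizer_type"] == "muon"
    return Muon(model.parameters(), **config["optimizer_config"])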


# Example usage and integration helpers
def integrate_with_bittransformerlm():
    """
    Example of integrating the Muon optimizer with BitTransformerLM training.

    Usage:
        from BTLM_Extensions.muon_optimizer import configure_muon_optimizer

        # Replace the standard optimizer configuration
        optimizer, scheduler = configure_muon_optimizer(
            model, lr=1e-3, momentum=0.95, total_steps=1000
        )

        # Use in the training loop
        train_loop(model, data, optimizer=optimizer, scheduler=scheduler)
    """
    pass


if __name__ == "__main__":
    # Smoke test for the optimizer.
    import torch.nn as nn

    model = nn.Sequential(
        nn.Linear(10, 20),
        nn.ReLU(),
        nn.Linear(20, 1),
    )

    optimizer, scheduler = configure_muon_optimizer(model, lr=1e-3, total_steps=100)

    # Single training step on random data.
    x = torch.randn(32, 10)
    y = torch.randn(32, 1)

    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()

    optimizer.step()
    if scheduler is not None:
        scheduler.step()

    print("Muon optimizer test completed successfully!")
    print(f"Loss: {loss.item():.4f}")