|
|
""" |
|
|
Muon Optimizer for BitTransformerLM Extensions |
|
|
============================================== |
|
|
|
|
|
Implementation of the Muon optimizer with orthogonal momentum updates. |
|
|
Based on "Muon: MomentUm Orthogonalized by Newton-Schulz" research.
|
|
|
|
|
Key features: |
|
|
- Orthogonal momentum updates |
|
|
- Better convergence properties than Adam/AdamW |
|
|
- Memory efficient implementation |
|
|
- Compatible with BitTransformerLM's training infrastructure |
|
|
""" |
|
|
|
|
|
import math |
|
|
import torch |
|
|
from torch.optim.optimizer import Optimizer |
|
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
|
import warnings |
|
|
|
|
|
|
|
|
class Muon(Optimizer):
    """
    Muon optimizer with orthogonal momentum updates.

    Momentum is accumulated as usual and then periodically replaced by its
    nearest (semi-)orthogonal factor, computed either with a Newton-Schulz
    iteration or an SVD.  Orthogonalized updates keep the per-direction
    scale of the step uniform, which leads to more stable training dynamics.

    Args:
        params: Iterable of parameters to optimize (or param-group dicts)
        lr: Learning rate (default: 1e-3)
        momentum: Momentum factor in [0, 1] (default: 0.95)
        nesterov: Enable Nesterov momentum (default: False)
        backend: Backend for orthogonalization ('newtonschulz' or 'svd')
        update_period: Orthogonalize the buffer every this many steps
            (default: 1; must be >= 1)
        rank_deficiency_threshold: If the norm of the orthogonalized buffer
            falls below this fraction of the raw buffer norm, the buffer is
            considered rank deficient and is left un-orthogonalized
        eps: Small constant for numerical stability (default: 1e-8)
        weight_decay: Weight decay coefficient added to the gradient
            (default: 0.0)
    """

    def __init__(
        self,
        params,
        lr: float = 1e-3,
        momentum: float = 0.95,
        nesterov: bool = False,
        backend: str = "newtonschulz",
        update_period: int = 1,
        rank_deficiency_threshold: float = 1e-6,
        eps: float = 1e-8,
        weight_decay: float = 0.0,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= momentum <= 1.0:
            raise ValueError(f"Invalid momentum value: {momentum}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if backend not in ["newtonschulz", "svd"]:
            raise ValueError(f"Invalid backend: {backend}")
        # `step % update_period` would raise ZeroDivisionError for 0 and
        # silently never orthogonalize for negative values.
        if update_period < 1:
            raise ValueError(f"Invalid update_period: {update_period}")

        defaults = dict(
            lr=lr,
            momentum=momentum,
            nesterov=nesterov,
            backend=backend,
            update_period=update_period,
            rank_deficiency_threshold=rank_deficiency_threshold,
            eps=eps,
            weight_decay=weight_decay,
        )
        super().__init__(params, defaults)

    def _orthogonalize_newtonschulz(self, matrix: torch.Tensor, num_iterations: int = 5) -> torch.Tensor:
        """Return the nearest orthogonal factor of ``matrix`` via Newton-Schulz.

        Tensors with more than two dimensions are flattened to 2-D over the
        leading dimensions, orthogonalized, and reshaped back.
        """
        original_shape = matrix.shape
        if matrix.dim() > 2:
            # reshape (not view): the momentum buffer may be non-contiguous.
            matrix = matrix.reshape(-1, matrix.shape[-1])

        # The cubic Newton-Schulz iteration X <- X (1.5 I - 0.5 X^T X) only
        # converges to the orthogonal polar factor when every singular value
        # is below sqrt(3).  The Frobenius norm upper-bounds the spectral
        # norm, so pre-scaling by it guarantees convergence.
        X = matrix / (matrix.norm() + 1e-7)

        if X.shape[0] >= X.shape[1]:
            # Tall matrix: iterate using the smaller Gram matrix X^T X.
            eye = torch.eye(X.shape[1], device=X.device, dtype=X.dtype)
            for _ in range(num_iterations):
                X = X @ (1.5 * eye - 0.5 * (X.T @ X))
        else:
            # Wide matrix: use the (smaller) X X^T Gram matrix instead.
            eye = torch.eye(X.shape[0], device=X.device, dtype=X.dtype)
            for _ in range(num_iterations):
                X = (1.5 * eye - 0.5 * (X @ X.T)) @ X

        return X.reshape(original_shape)

    def _orthogonalize_svd(self, matrix: torch.Tensor) -> torch.Tensor:
        """Return the orthogonal polar factor U @ V^T of ``matrix`` via SVD."""
        original_shape = matrix.shape
        if matrix.dim() > 2:
            matrix = matrix.reshape(-1, matrix.shape[-1])

        try:
            U, _, Vt = torch.linalg.svd(matrix, full_matrices=False)
            return (U @ Vt).reshape(original_shape)
        except torch.linalg.LinAlgError:
            # Public exception type (torch._C._LinAlgError is private API).
            # SVD can fail to converge on ill-conditioned inputs.  Restore
            # the caller's original shape: `matrix` was flattened above.
            warnings.warn("SVD failed, falling back to Newton-Schulz")
            return self._orthogonalize_newtonschulz(matrix).reshape(original_shape)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None``.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                grad = p.grad
                if grad.dtype in {torch.float16, torch.bfloat16}:
                    # Do the momentum arithmetic in float32 for stability.
                    grad = grad.float()

                state = self.state[p]

                if len(state) == 0:
                    state["step"] = 0
                    # Match the (possibly promoted) gradient dtype: an
                    # in-place float32 -> float16 accumulation would raise.
                    state["momentum_buffer"] = torch.zeros_like(
                        grad, memory_format=torch.preserve_format
                    )

                momentum_buffer = state["momentum_buffer"]
                state["step"] += 1

                if group["weight_decay"] != 0:
                    # Out-of-place add promotes dtypes safely.
                    grad = grad.add(p, alpha=group["weight_decay"])

                momentum_buffer.mul_(group["momentum"]).add_(grad)

                # Periodically replace the momentum with its orthogonal
                # polar factor; only meaningful for genuine matrices.
                if state["step"] % group["update_period"] == 0 and momentum_buffer.numel() > 1:
                    if momentum_buffer.dim() >= 2 and min(momentum_buffer.shape[-2:]) > 1:
                        if group["backend"] == "newtonschulz":
                            orthogonal_momentum = self._orthogonalize_newtonschulz(momentum_buffer)
                        else:
                            orthogonal_momentum = self._orthogonalize_svd(momentum_buffer)

                        # Full Frobenius norm (`.norm()`) yields a scalar for
                        # any rank, unlike matrix_norm which batches over the
                        # leading dims of >2-D buffers; eps keeps the ratio
                        # finite for an all-zero buffer.
                        rank_ratio = orthogonal_momentum.norm() / (
                            momentum_buffer.norm() + group["eps"]
                        )
                        if rank_ratio < group["rank_deficiency_threshold"]:
                            warnings.warn("Detected rank deficiency in momentum buffer")
                        else:
                            momentum_buffer.copy_(orthogonal_momentum)

                if group["nesterov"]:
                    update = grad.add(momentum_buffer, alpha=group["momentum"])
                else:
                    update = momentum_buffer

                # Cast back so the in-place update works on low-precision
                # parameters (no-op when dtypes already match).
                p.add_(update.to(p.dtype), alpha=-group["lr"])

        return loss
|
|
|
|
|
|
|
|
def configure_muon_optimizer(
    model: torch.nn.Module,
    lr: float = 1e-3,
    momentum: float = 0.95,
    weight_decay: float = 0.01,
    total_steps: Optional[int] = None,
    warmup_ratio: float = 0.1,
    nesterov: bool = False,
    backend: str = "newtonschulz",
    **muon_kwargs
) -> Tuple[Muon, Optional[torch.optim.lr_scheduler._LRScheduler]]:
    """
    Configure Muon optimizer with OneCycle learning rate schedule.

    Drop-in replacement for BitTransformerLM's ``configure_optimizer``,
    using Muon instead of AdamW.  Weight decay is applied only to matrix
    parameters (dim >= 2); vectors and scalars (biases, norms) are exempt.

    Args:
        model: PyTorch model to optimize
        lr: Peak learning rate
        momentum: Momentum factor for Muon
        weight_decay: Weight decay coefficient
        total_steps: Total training steps for OneCycle schedule
        warmup_ratio: Fraction of steps for warmup
        nesterov: Enable Nesterov momentum
        backend: Orthogonalization backend
        **muon_kwargs: Additional arguments for Muon optimizer

    Returns:
        Tuple of (optimizer, scheduler); scheduler is None unless
        ``total_steps`` is a positive integer.
    """
    # Partition trainable parameters: matrices get weight decay, 1-D
    # parameters (biases, norm scales) do not.
    decay, no_decay = [], []
    for param in model.parameters():
        if not param.requires_grad:
            continue
        (decay if param.dim() >= 2 else no_decay).append(param)

    optimizer = Muon(
        [
            {"params": decay, "weight_decay": weight_decay},
            {"params": no_decay, "weight_decay": 0.0},
        ],
        lr=lr,
        momentum=momentum,
        nesterov=nesterov,
        backend=backend,
        **muon_kwargs
    )

    if total_steps is None or total_steps <= 0:
        return optimizer, None

    # cycle_momentum=False: Muon has no 'betas'/'momentum' cycling hooks
    # compatible with OneCycle's momentum scheduling.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=lr,
        total_steps=total_steps,
        pct_start=warmup_ratio,
        anneal_strategy='cos',
        cycle_momentum=False,
        div_factor=25.0,
        final_div_factor=1e4,
    )
    return optimizer, scheduler
|
|
|
|
|
|
|
|
def create_muon_training_config(
    lr: float = 1e-3,
    momentum: float = 0.95,
    weight_decay: float = 0.01,
    backend: str = "newtonschulz",
    nesterov: bool = False,
    **kwargs
) -> Dict[str, Any]:
    """
    Create a training configuration dictionary for Muon optimizer.

    The returned dictionary can be passed to BitTransformerLM's training
    scripts to select Muon with a OneCycle schedule.

    Args:
        lr: Learning rate
        momentum: Momentum factor
        weight_decay: Weight decay coefficient
        backend: Orthogonalization backend
        nesterov: Enable Nesterov momentum
        **kwargs: Additional options merged into ``optimizer_config``
            (overriding the named arguments on key collision)

    Returns:
        Dictionary containing training configuration
    """
    optimizer_config: Dict[str, Any] = {
        "lr": lr,
        "momentum": momentum,
        "weight_decay": weight_decay,
        "backend": backend,
        "nesterov": nesterov,
    }
    optimizer_config.update(kwargs)

    return {
        "optimizer_type": "muon",
        "optimizer_config": optimizer_config,
        "scheduler_type": "onecycle",
    }
|
|
|
|
|
|
|
|
|
|
|
def integrate_with_bittransformerlm():
    """
    Example of how to integrate Muon optimizer with BitTransformerLM training.

    This is a documentation-only stub; it performs no work.

    Usage::

        from BTLM_Extensions.muon_optimizer import configure_muon_optimizer

        # Swap out the standard optimizer configuration
        optimizer, scheduler = configure_muon_optimizer(
            model, lr=1e-3, momentum=0.95, total_steps=1000
        )

        # Then drive the training loop as usual
        train_loop(model, data, optimizer=optimizer, scheduler=scheduler)
    """
    return None
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import torch.nn as nn

    # Smoke test: one forward/backward/step cycle on a tiny MLP.
    net = nn.Sequential(
        nn.Linear(10, 20),
        nn.ReLU(),
        nn.Linear(20, 1),
    )

    optimizer, scheduler = configure_muon_optimizer(net, lr=1e-3, total_steps=100)

    inputs = torch.randn(32, 10)
    targets = torch.randn(32, 1)

    loss = nn.functional.mse_loss(net(inputs), targets)
    loss.backward()

    optimizer.step()
    if scheduler:
        scheduler.step()

    print("Muon optimizer test completed successfully!")
    print(f"Loss: {loss.item():.4f}")