IvmeLabs
/

Ivme-Conversate-22M-Base

+"""
+Muon optimizer for İvme — MomentUm Orthogonalized by Newton-schulz.
+Muon orthogonalizes each 2D weight's momentum-smoothed gradient via a quintic
+Newton-Schulz iteration before the update. It consistently beats AdamW on
+transformer bodies, especially on reasoning-heavy benchmarks, at negligible
+extra cost.
+Standard practice (and what we use here): Muon for the 2D transformer matrices,
+AdamW for everything else — embeddings, the LM head, and all 1D params (norms).
+Since İvme ties its embeddings, the shared embed/head table goes to AdamW.
+Reference: Keller Jordan's Muon (github.com/KellerJordan/Muon).
+"""
+from __future__ import annotations
+import torch
+from torch.optim import AdamW
+# --------------------------------------------------------------------------- #
+# Newton-Schulz orthogonalization
+# --------------------------------------------------------------------------- #
+@torch.no_grad()
+def zeropower_via_newtonschulz5(G: torch.Tensor, steps: int = 5) -> torch.Tensor:
+    """Compute an approximate orthogonalization of G via a quintic NS iteration.
+    The coefficients (a, b, c) are tuned so the iteration pushes the singular
+    values of G toward 1 without ever computing an SVD. Runs in bf16 for speed.
+    """
+    assert G.ndim == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    X = G.bfloat16()
+    transposed = G.size(0) > G.size(1)
+    if transposed:
+        X = X.T
+    # Normalize so the spectral norm is <= 1 before iterating.
+    X = X / (X.norm() + 1e-7)
+    for _ in range(steps):
+        A = X @ X.T
+        B = b * A + c * (A @ A)
+        X = a * X + B @ X
+    if transposed:
+        X = X.T
+    return X
+# --------------------------------------------------------------------------- #
+# Muon
+# --------------------------------------------------------------------------- #
+class Muon(torch.optim.Optimizer):
+    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
+        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
+        super().__init__(params, defaults)
+    @torch.no_grad()
+    def step(self):
+        for group in self.param_groups:
+            lr = group["lr"]
+            momentum = group["momentum"]
+            nesterov = group["nesterov"]
+            ns_steps = group["ns_steps"]
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                g = p.grad
+                state = self.state[p]
+                if "momentum_buffer" not in state:
+                    state["momentum_buffer"] = torch.zeros_like(g)
+                buf = state["momentum_buffer"]
+                buf.mul_(momentum).add_(g)
+                g = g.add(buf, alpha=momentum) if nesterov else buf
+                g = zeropower_via_newtonschulz5(g, steps=ns_steps)
+                # Scale so the update RMS roughly matches the parameter shape;
+                # an orthogonalized matrix has spectral norm ~1 regardless of size.
+                scale = max(1.0, g.size(0) / g.size(1)) ** 0.5
+                p.add_(g.to(p.dtype), alpha=-lr * scale)
+# --------------------------------------------------------------------------- #
+# Hybrid optimizer builder
+# --------------------------------------------------------------------------- #
+def build_optimizers(model, muon_lr=0.02, adamw_lr=3e-4, weight_decay=0.1,
+                     betas=(0.9, 0.95)):
+    """Split İvme's params into Muon (2D transformer matrices) and AdamW (rest).
+    Returns (muon, adamw). Step both each iteration; schedule both together.
+    """
+    muon_params, adamw_params = [], []
+    for name, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        # 2D weights inside transformer blocks -> Muon.
+        # Embeddings, LM head, and all 1D params (norms) -> AdamW.
+        is_body_matrix = (
+            p.ndim == 2
+            and "embed" not in name
+            and "lm_head" not in name
+        )
+        (muon_params if is_body_matrix else adamw_params).append(p)
+    muon = Muon(muon_params, lr=muon_lr)
+    adamw = AdamW(adamw_params, lr=adamw_lr, betas=betas, weight_decay=weight_decay)
+    n_muon = sum(p.numel() for p in muon_params)
+    n_adamw = sum(p.numel() for p in adamw_params)
+    print(f"[optim] Muon  : {len(muon_params)} tensors, {n_muon:,} params")
+    print(f"[optim] AdamW : {len(adamw_params)} tensors, {n_adamw:,} params")
+    return muon, adamw
+# --------------------------------------------------------------------------- #
+# WSD learning-rate schedule
+# --------------------------------------------------------------------------- #
+def wsd_lr_multiplier(step: int, total_steps: int, warmup: int = 100,
+                      decay_frac: float = 0.2) -> float:
+    """Warmup-Stable-Decay multiplier in [0, 1].
+    Linear warmup -> constant stable phase -> linear decay to ~0 over the final
+    `decay_frac` of training. Multiply each optimizer's base lr by this value.
+    """
+    decay_start = int(total_steps * (1 - decay_frac))
+    if step < warmup:
+        return step / max(1, warmup)
+    if step < decay_start:
+        return 1.0
+    # Linear decay over the final decay_frac of steps.
+    progress = (step - decay_start) / max(1, total_steps - decay_start)
+    return max(0.0, 1.0 - progress)
+# --------------------------------------------------------------------------- #
+# Self-test
+# --------------------------------------------------------------------------- #
+if __name__ == "__main__":
+    # 1) Newton-Schulz should produce a near-orthogonal matrix.
+    torch.manual_seed(0)
+    G = torch.randn(384, 1024)
+    Q = zeropower_via_newtonschulz5(G).float()
+    # For a wide matrix, Q @ Q.T should be close to identity.
+    I = Q @ Q.T
+    err = (I - torch.eye(Q.size(0))).abs().mean().item()
+    print(f"[ns] orthogonality error (lower=better): {err:.4f}")
+    # 2) WSD schedule shape.
+    total = 6000
+    pts = [0, 50, 100, 1000, 4800, 5400, 5999]
+    print("[wsd] step -> lr_mult")
+    for s in pts:
+        print(f"      {s:>5} -> {wsd_lr_multiplier(s, total):.3f}")