Motif-Technologies
/

optimizer

Kernels

Model card Files and versions

xet

Community

dongseokmotif committed 18 days ago

Commit

aff01db

1 Parent(s): abaa449

Apply pre-commit formatting (yapf, isort) [skip-build]

Browse files

Files changed (1) hide show

torch-ext/optimizer/newton_schulz.py +26 -15

torch-ext/optimizer/newton_schulz.py CHANGED Viewed

@@ -1,13 +1,15 @@
-import torch
 from itertools import repeat
 from math import inf, sqrt
 import numpy as np
 from .matmul_transpose_triton import matmul_transpose_assign
 COMM_DTYPE = torch.bfloat16
 DEFAULT_CHUNK_SIZE_RATIO = 4
 def _optimal_quintic(l, u):
     """
     Use the simplified Remez algorithm to find the optimal odd quintic approximant
@@ -20,9 +22,9 @@ def _optimal_quintic(l, u):
     """
     assert 0 <= l <= u
     if 1 - 5e-6 <= l / u:
-        return (15/8)/u, (-10/8)/(u**3), (3/8)/(u**5)
-    q = (3*l + u) / 4
-    r = (l + 3*u) / 4
     E, old_E = inf, None
     while not old_E or abs(old_E - E) > 1e-15:
         old_E = E
@@ -33,8 +35,9 @@ def _optimal_quintic(l, u):
             [u, u**3, u**5, -1],
         ])
         a, b, c, E = np.linalg.solve(LHS, np.ones(4))
-        q, r = np.sqrt((-3*b + np.array([-1, 1]) *
-                        sqrt(9*b**2 - 20*a*c)) / (10*c))
     return float(a), float(b), float(c)
@@ -63,16 +66,20 @@ def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
     safety_factor = 1 + safety_factor_eps
     coefficients = []
     for iter in range(num_iters):
-        a, b, c = _optimal_quintic(max(l, cushion*u), u)
-        if cushion*u > l:
-            pl = a*l + b*l**3 + c*l**5
-            pu = a*u + b*u**3 + c*u**5
-            rescaler = 2/(pl + pu)
-            a *= rescaler; b *= rescaler; c *= rescaler
         if iter < num_iters - 1:
-            a /= safety_factor; b /= safety_factor**3; c /= safety_factor**5
         coefficients.append((a, b, c))
-        l = a*l + b*l**3 + c*l**5
         u = 2 - l
     return coefficients
@@ -89,7 +96,11 @@ def _optimal_composition(l, num_iters, safety_factor_eps=0, cushion=0):
 #   - Polar Express: analytically optimal per step, adapting to the shrinking
 #     singular-value interval [l, u] as iterations progress; converges all
 #     singular values to 1, producing the exact polar factor UV^T.
-_coeffs_list = _optimal_composition(l=1e-3, num_iters=10, safety_factor_eps=1e-2, cushion=0.02)
 # This code is adapted from:
 #   KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)

 from itertools import repeat
 from math import inf, sqrt
 import numpy as np
+import torch
 from .matmul_transpose_triton import matmul_transpose_assign
 COMM_DTYPE = torch.bfloat16
 DEFAULT_CHUNK_SIZE_RATIO = 4
 def _optimal_quintic(l, u):
     """
     Use the simplified Remez algorithm to find the optimal odd quintic approximant
     """
     assert 0 <= l <= u
     if 1 - 5e-6 <= l / u:
+        return (15 / 8) / u, (-10 / 8) / (u**3), (3 / 8) / (u**5)
+    q = (3 * l + u) / 4
+    r = (l + 3 * u) / 4
     E, old_E = inf, None
     while not old_E or abs(old_E - E) > 1e-15:
         old_E = E
             [u, u**3, u**5, -1],
         ])
         a, b, c, E = np.linalg.solve(LHS, np.ones(4))
+        q, r = np.sqrt(
+            (-3 * b + np.array([-1, 1]) * sqrt(9 * b**2 - 20 * a * c)) /
+            (10 * c))
     return float(a), float(b), float(c)
     safety_factor = 1 + safety_factor_eps
     coefficients = []
     for iter in range(num_iters):
+        a, b, c = _optimal_quintic(max(l, cushion * u), u)
+        if cushion * u > l:
+            pl = a * l + b * l**3 + c * l**5
+            pu = a * u + b * u**3 + c * u**5
+            rescaler = 2 / (pl + pu)
+            a *= rescaler
+            b *= rescaler
+            c *= rescaler
         if iter < num_iters - 1:
+            a /= safety_factor
+            b /= safety_factor**3
+            c /= safety_factor**5
         coefficients.append((a, b, c))
+        l = a * l + b * l**3 + c * l**5
         u = 2 - l
     return coefficients
 #   - Polar Express: analytically optimal per step, adapting to the shrinking
 #     singular-value interval [l, u] as iterations progress; converges all
 #     singular values to 1, producing the exact polar factor UV^T.
+_coeffs_list = _optimal_composition(l=1e-3,
+                                    num_iters=10,
+                                    safety_factor_eps=1e-2,
+                                    cushion=0.02)
 # This code is adapted from:
 #   KellerJordan/Muon (https://github.com/KellerJordan/Muon/blob/master/muon.py)