AbstractPhil committed on
Commit
0e9f66f
Β·
verified Β·
1 Parent(s): 8de8161

Create eigh_readme_version_tester.py

Browse files
Files changed (1) hide show
  1. eigh_readme_version_tester.py +281 -0
eigh_readme_version_tester.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ fl_eigh.py β€” Hybrid optimal eigendecomposition.
3
+
4
+ Combines FL algebraic superiority with geometric refinement:
5
+ Phase 1: FL characteristic polynomial (fp64) β†’ algebraically exact coefficients
6
+ Phase 2: Laguerre + Newton polish β†’ algebraically optimal eigenvalues
7
+ Phase 3: FL adjugate (fp64 Horner + max-col) β†’ eigenvector extraction
8
+ Phase 4: Newton-Schulz β†’ orthonormal eigenvectors (geometric projection)
9
+ Phase 5: Rayleigh quotient β†’ Ξ»α΅’ = vα΅’α΅€Avα΅’ (geometrically optimal eigenvalues)
10
+
11
+ The Rayleigh quotient is the KEY insight: given orthonormal eigenvectors V,
12
+ the eigenvalues Ξ»α΅’ = vα΅’α΅€Avα΅’ minimize ||Av - Ξ»v||Β² β€” the eigenpair residual.
13
+ This fuses FL's algebraic precision with geometric optimality.
14
+
15
+ Result: eigenvalues that minimize residual + eigenvectors that are orthonormal.
16
+ Both reconstruction ||A - VΞ›Vα΅€|| and eigenpair ||Av - Ξ»v|| are optimal.
17
+
18
+ Author: AbstractPhil / GeoLIP project
19
+ """
20
+
21
+ import math, time, gc, sys
22
+ import torch
23
+ import torch.nn as nn
24
+ from torch import Tensor
25
+ from typing import Tuple
26
+
27
+ torch.backends.cuda.matmul.allow_tf32 = False
28
+ torch.backends.cudnn.allow_tf32 = False
29
+ torch.set_float32_matmul_precision('highest')
30
+
31
+
32
+ class FLEigh(nn.Module):
33
+
34
+ def forward(self, A: Tensor) -> Tuple[Tensor, Tensor]:
35
+ B, n, _ = A.shape
36
+ device = A.device
37
+
38
+ # ── Pre-scale ──
39
+ scale = (torch.linalg.norm(A.reshape(B, -1), dim=-1) / math.sqrt(n)).clamp(min=1e-12)
40
+ As = A / scale[:, None, None]
41
+
42
+ # ══════ Phase 1: Faddeev-LeVerrier (fp64) ══════
43
+ # n bmm β†’ characteristic polynomial + adjugate basis
44
+ Ad = As.double()
45
+ eye_d = torch.eye(n, device=device, dtype=torch.float64).unsqueeze(0).expand(B, -1, -1)
46
+ c = torch.zeros(B, n + 1, device=device, dtype=torch.float64)
47
+ c[:, n] = 1.0
48
+ Mstore = torch.zeros(n + 1, B, n, n, device=device, dtype=torch.float64)
49
+ Mk = torch.zeros(B, n, n, device=device, dtype=torch.float64)
50
+ for k in range(1, n + 1):
51
+ Mk = torch.bmm(Ad, Mk) + c[:, n - k + 1, None, None] * eye_d
52
+ Mstore[k] = Mk
53
+ c[:, n - k] = -(Ad * Mk).sum((-2, -1)) / k
54
+
55
+ # ══════ Phase 2: Laguerre + Polish β†’ algebraic eigenvalues ══════
56
+ use_f64 = n > 6
57
+ dt = torch.float64 if use_f64 else torch.float32
58
+ cl = c.to(dt).clone()
59
+ roots = torch.zeros(B, n, device=device, dtype=dt)
60
+ zi = As.to(dt).diagonal(dim1=-2, dim2=-1).sort(dim=-1).values
61
+ zi = zi + torch.linspace(-1e-4, 1e-4, n, device=device, dtype=dt).unsqueeze(0)
62
+
63
+ for ri in range(n):
64
+ deg = n - ri
65
+ z = zi[:, ri]
66
+ for _ in range(5):
67
+ pv = cl[:, deg]; dp = torch.zeros(B, device=device, dtype=dt)
68
+ d2 = torch.zeros(B, device=device, dtype=dt)
69
+ for j in range(deg - 1, -1, -1):
70
+ d2 = d2 * z + dp; dp = dp * z + pv; pv = pv * z + cl[:, j]
71
+ ok = pv.abs() > 1e-30
72
+ ps = torch.where(ok, pv, torch.ones_like(pv))
73
+ G = torch.where(ok, dp / ps, torch.zeros_like(dp))
74
+ H = G * G - torch.where(ok, 2.0 * d2 / ps, torch.zeros_like(d2))
75
+ disc = ((deg - 1.0) * (deg * H - G * G)).clamp(min=0.0)
76
+ sq = torch.sqrt(disc); gp = G + sq; gm = G - sq
77
+ den = torch.where(gp.abs() >= gm.abs(), gp, gm)
78
+ dok = den.abs() > 1e-20
79
+ ds = torch.where(dok, den, torch.ones_like(den))
80
+ z = z - torch.where(dok, float(deg) / ds, torch.zeros_like(den))
81
+ roots[:, ri] = z
82
+ b = cl[:, deg]
83
+ for j in range(deg - 1, 0, -1):
84
+ bn = cl[:, j] + z * b; cl[:, j] = b; b = bn
85
+ cl[:, 0] = b
86
+
87
+ # Newton polish on original polynomial (fp64)
88
+ roots = roots.double()
89
+ for _ in range(3):
90
+ pv = torch.ones(B, n, device=device, dtype=torch.float64)
91
+ dp = torch.zeros(B, n, device=device, dtype=torch.float64)
92
+ for j in range(n - 1, -1, -1):
93
+ dp = dp * roots + pv; pv = pv * roots + c[:, j:j + 1]
94
+ ok = dp.abs() > 1e-30
95
+ dps = torch.where(ok, dp, torch.ones_like(dp))
96
+ roots = roots - torch.where(ok, pv / dps, torch.zeros_like(pv))
97
+
98
+ # ══════ Phase 3: FL adjugate β†’ eigenvector extraction (fp64) ══════
99
+ # Horner evaluation of adj(Ξ»I-A) at each eigenvalue
100
+ lam = roots # [B, n] fp64
101
+ R = Mstore[1].unsqueeze(1).expand(-1, n, -1, -1).clone()
102
+ for k in range(2, n + 1):
103
+ R = R * lam[:, :, None, None] + Mstore[k].unsqueeze(1)
104
+
105
+ # Max-norm column extraction (robust for all n)
106
+ cnorms = R.norm(dim=-2) # [B, n_eig, n_mat]
107
+ best = cnorms.argmax(dim=-1) # [B, n_eig]
108
+ idx = best.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, n, 1)
109
+ vec = R.gather(-1, idx).squeeze(-1) # [B, n_eig, n_mat]
110
+ vec = vec / (vec.norm(dim=-1, keepdim=True) + 1e-30)
111
+ V = vec.float().transpose(-2, -1) # [B, n, n] columns = eigvecs
112
+
113
+ # ══════ Phase 4: Newton-Schulz β†’ orthonormal eigenvectors ══════
114
+ # 2 iterations: stable for all n (3 diverges on near-degenerate cases)
115
+ eye_f = torch.eye(n, device=device, dtype=torch.float32).unsqueeze(0).expand(B, -1, -1)
116
+ Y = torch.bmm(V.transpose(-2, -1), V)
117
+ X = eye_f.clone()
118
+ for _ in range(2):
119
+ T = 3.0 * eye_f - Y
120
+ X = 0.5 * torch.bmm(X, T)
121
+ Y = 0.5 * torch.bmm(T, Y)
122
+ V = torch.bmm(V, X)
123
+
124
+ # ══════ Phase 5: Rayleigh quotient β†’ geometrically optimal eigenvalues ══════
125
+ # Ξ»α΅’ = vα΅’α΅€ A vα΅’ β€” minimizes ||Av - Ξ»v||Β² for the given v
126
+ AV = torch.bmm(A, V) # [B, n, n]
127
+ evals = (V * AV).sum(dim=-2) # [B, n] = diag(Vα΅€AV)
128
+
129
+ # ── Sort ──
130
+ se, perm = evals.sort(dim=-1)
131
+ sv = V.gather(-1, perm.unsqueeze(-2).expand_as(V))
132
+ return se, sv
133
+
134
+
135
+ # ═══════════════════════════════════════════════════════════════════════
136
+ # Mathematical purity test
137
+ # ═══════════════════════════════════════════════════════════════════════
138
+
139
+ def math_test(A, vals, vecs):
140
+ B, n, _ = A.shape
141
+ dev = A.device
142
+ Ad = A.double(); vd = vals.double(); Vd = vecs.double()
143
+ AV = torch.bmm(Ad, Vd); VL = Vd * vd.unsqueeze(-2)
144
+ An = Ad.reshape(B, -1).norm(dim=-1, keepdim=True).clamp(min=1e-30)
145
+ res = (AV - VL).norm(dim=-2) / An # per-eigvec residual
146
+ VtV = torch.bmm(Vd.mT, Vd)
147
+ I = torch.eye(n, device=dev, dtype=torch.float64).unsqueeze(0)
148
+ orth = (VtV - I).reshape(B, -1).norm(dim=-1)
149
+ recon = torch.bmm(Vd * vd.unsqueeze(-2), Vd.mT)
150
+ recon_err = (Ad - recon).reshape(B, -1).norm(dim=-1) / An.squeeze(-1)
151
+ tr_err = (Ad.diagonal(dim1=-2,dim2=-1).sum(-1) - vd.sum(-1)).abs()
152
+ det_A = torch.linalg.det(Ad)
153
+ det_err = (det_A - vd.prod(-1)).abs() / det_A.abs().clamp(min=1e-30)
154
+ cp = torch.zeros(B, n, device=dev, dtype=torch.float64)
155
+ for i in range(n):
156
+ cp[:, i] = torch.linalg.det(vd[:, i:i+1, None] * I - Ad).abs()
157
+ return dict(
158
+ res_max=res.max().item(), res_mean=res.mean().item(),
159
+ orth_max=orth.max().item(), orth_mean=orth.mean().item(),
160
+ recon_max=recon_err.max().item(), recon_mean=recon_err.mean().item(),
161
+ tr_max=tr_err.max().item(), tr_mean=tr_err.mean().item(),
162
+ det_max=det_err.max().item(), det_mean=det_err.mean().item(),
163
+ cp_max=cp.max().item(), cp_mean=cp.mean().item(),
164
+ )
165
+
166
+
167
+ def sync(): torch.cuda.synchronize()
168
+ def gt(fn, w=20, r=300):
169
+ for _ in range(w): fn()
170
+ sync(); t=time.perf_counter()
171
+ for _ in range(r): fn()
172
+ sync(); return (time.perf_counter()-t)/r
173
+ def fmt(s):
174
+ if s<1e-3: return f"{s*1e6:.1f}Β΅s"
175
+ if s<1: return f"{s*1e3:.2f}ms"
176
+ return f"{s:.3f}s"
177
+
178
+
179
+ def main():
180
+ if not torch.cuda.is_available(): sys.exit(1)
181
+ dev = torch.device('cuda')
182
+ p = torch.cuda.get_device_properties(0)
183
+
184
+ print("="*72)
185
+ print(" FL Hybrid Eigh β€” Algebraic + Geometric Optimal")
186
+ print("="*72)
187
+ print(f" {p.name} | PyTorch {torch.__version__}")
188
+
189
+ # ── Mathematical purity sweep ──
190
+ print("\n" + "="*72)
191
+ print(" MATHEMATICAL PURITY (no reference impl, only definitions)")
192
+ print("="*72)
193
+
194
+ for nx in [3, 5, 6, 8, 10, 12]:
195
+ B = 2048 if nx <= 8 else 1024
196
+ A = (lambda R:(R+R.mT)/2)(torch.randn(B, nx, nx, device=dev))
197
+ cv, cV = torch.linalg.eigh(A)
198
+ fv, fV = FLEigh()(A)
199
+ mc = math_test(A, cv, cV); mf = math_test(A, fv, fV)
200
+
201
+ wins_c = 0; wins_f = 0
202
+ for key in mc:
203
+ if mf[key] < mc[key]: wins_f += 1
204
+ elif mc[key] < mf[key]: wins_c += 1
205
+ print(f"\n n={nx} B={B}: FL wins {wins_f}/12, cuSOLVER wins {wins_c}/12")
206
+
207
+ def row(name, key):
208
+ vc=mc[key]; vf=mf[key]
209
+ w="FL" if vf<vc else ("cuS" if vc<vf else "tie")
210
+ m="β—„" if w=="FL" else ("β–Ί" if w=="cuS" else " ")
211
+ print(f" {name:<28} {vc:>10.1e} {vf:>10.1e} {w} {m}")
212
+
213
+ print(f" {'Property':<28} {'cuSOLVER':>10} {'FL':>10}")
214
+ row("Eigenpair max", "res_max")
215
+ row("Eigenpair mean", "res_mean")
216
+ row("Orthogonality max", "orth_max")
217
+ row("Orthogonality mean", "orth_mean")
218
+ row("Reconstruction max", "recon_max")
219
+ row("Reconstruction mean", "recon_mean")
220
+ row("Trace max", "tr_max")
221
+ row("Determinant max", "det_max")
222
+ row("Char poly max", "cp_max")
223
+ row("Char poly mean", "cp_mean")
224
+ del A
225
+
226
+ # ── Accuracy pass/fail ──
227
+ print("\n" + "="*72)
228
+ print(" ACCURACY PASS/FAIL")
229
+ print("="*72)
230
+ ok_all = True
231
+ for nx in [3,4,5,6,8,10,12,16]:
232
+ A = (lambda R:(R+R.mT)/2)(torch.randn(1024, nx, nx, device=dev))
233
+ rv,rV = torch.linalg.eigh(A); fv,fV = FLEigh()(A)
234
+ ve = (fv-rv).abs().max().item()
235
+ dots = torch.bmm(rV.double().mT, fV.double()).abs().max(dim=-1).values.min().item()
236
+ ok = ve < 1e-2 and dots > 0.99
237
+ if not ok: ok_all = False
238
+ print(f" [{'OK' if ok else 'NO'}] n={nx:>2} val_diff={ve:.1e} align={dots:.6f}")
239
+ del A
240
+
241
+ # ── Speed ──
242
+ N=6; B=4096
243
+ A = (lambda R:(R+R.mT)/2)(torch.randn(B, N, N, device=dev))
244
+ solver = FLEigh()
245
+
246
+ print(f"\n" + "="*72)
247
+ print(f" THROUGHPUT (n={N} B={B})")
248
+ print("="*72)
249
+ for _ in range(5): solver(A); sync()
250
+ tr = gt(lambda: torch.linalg.eigh(A))
251
+ te = gt(lambda: solver(A))
252
+ print(f" cuSOLVER: {fmt(tr)}")
253
+ print(f" FL eager: {fmt(te)} ({tr/te:.2f}Γ—)")
254
+
255
+ try:
256
+ cs = torch.compile(solver, fullgraph=True)
257
+ print(" Compiling...", end=" ", flush=True)
258
+ for _ in range(3): cs(A); sync()
259
+ print("done.")
260
+ tc = gt(lambda: cs(A))
261
+ print(f" FL compiled: {fmt(tc)} ({tr/tc:.2f}Γ—)")
262
+ except Exception as e:
263
+ print(f" COMPILE FAILED: {str(e)[:100]}")
264
+ tc = None
265
+
266
+ # ── Memory ──
267
+ print(f"\n MEMORY")
268
+ for l,fn in [("cuSOLVER",lambda:torch.linalg.eigh(A)),("FL",lambda:solver(A))]:
269
+ torch.cuda.empty_cache(); gc.collect(); torch.cuda.reset_peak_memory_stats()
270
+ b=torch.cuda.memory_allocated(); fn(); sync()
271
+ print(f" {l:<10} {(torch.cuda.max_memory_allocated()-b)/1024**2:.1f}MB")
272
+
273
+ # ── Summary ──
274
+ print(f"\n" + "="*72)
275
+ print(f" All pass: {ok_all}")
276
+ if tc: print(f" Compiled: {tr/tc:.2f}Γ— vs cuSOLVER")
277
+ print("="*72)
278
+
279
+
280
+ if __name__ == '__main__':
281
+ main()