Upload 6 files

Browse files

Files changed (6) hide show

spectral/notebooks/experiment_2_manifold_structures.ipynb +151 -14
spectral/notebooks/experiment_3_compact_representations.ipynb +137 -1
spectral/notebooks/experiment_4_invertible_transforms.ipynb +137 -1
spectral/notebooks/experiment_5_matrix_decompositions.ipynb +137 -1
spectral/notebooks/experiment_6_losses_and_anchors.ipynb +137 -1
spectral/notebooks/experiment_7_composite_pipelines.ipynb +137 -1

spectral/notebooks/experiment_2_manifold_structures.ipynb CHANGED Viewed

@@ -39,7 +39,7 @@
    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
-    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
@@ -92,6 +92,142 @@
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
@@ -1766,12 +1902,10 @@
    "source": [
     "# @title Experiment 2.2 \u2014 Grassmannian Subspace Features\n",
     "class GrassmannianFrontEnd(nn.Module):\n",
-    "    \"\"\"Grassmannian subspace features via direct SVD.\n",
-    "    X = U S Vh. Features: singular values (spectral profile),\n",
-    "    log singular value ratios (relative spectrum), and right singular\n",
-    "    vectors V (subspace orientation in channel space \u2014 the actual\n",
-    "    Grassmannian coordinate). V varies per patch and encodes which\n",
-    "    linear combinations of RGB correspond to principal directions.\"\"\"\n",
     "    def __init__(self, patch_size=8, k=3, input_size=32):\n",
     "        super().__init__()\n",
     "        self.patch_size = patch_size\n",
@@ -1782,7 +1916,8 @@
     "        # k singular values + k log-ratios + k*C right singular vector entries\n",
     "        self.features_per_patch = k + k + k * self.C\n",
     "        self.output_dim = self.n_patches * self.features_per_patch\n",
-    "        print(f\"[GRASS] {self.n_patches} patches, k={k}, dim={self.output_dim} (direct SVD)\")\n",
     "\n",
     "    @torch.amp.custom_fwd(device_type='cuda', cast_inputs=torch.float32)\n",
     "    def forward(self, x):\n",
@@ -1792,8 +1927,8 @@
     "        n_p = patches.shape[2] * patches.shape[3]\n",
     "        # X: (B*n_p, ps*ps, C) \u2014 each patch as a tall-skinny matrix\n",
     "        X = patches.permute(0, 2, 3, 1, 4, 5).reshape(B * n_p, C, ps * ps).permute(0, 2, 1)\n",
-    "        # Direct thin SVD: X = U S Vh, U:(N,64,3) S:(N,3) Vh:(N,3,3)\n",
-    "        U, S, Vh = torch.linalg.svd(X, full_matrices=False)\n",
     "        S = S[:, :self.k]\n",
     "        # Log singular value ratios (scale-invariant spectrum)\n",
     "        sv_ratios = torch.log(S / (S[:, -1:] + 1e-8) + 1e-8)\n",
@@ -1834,7 +1969,8 @@
    "source": [
     "# @title Experiment 2.3 \u2014 Flag Manifold\n",
     "class FlagManifoldFrontEnd(nn.Module):\n",
-    "    \"\"\"Cascading SVD at multiple truncation levels via direct SVD.\n",
     "    Nested subspace features: singular values + projection norms at each flag level.\n",
     "    The flag structure captures how information distributes across\n",
     "    nested subspace hierarchies \u2014 a genuine flag manifold signature.\"\"\"\n",
@@ -1847,7 +1983,8 @@
     "        max_sv = min(3, patch_size * patch_size)\n",
     "        self.features_per_patch = sum(min(k, max_sv) * 2 for k in levels)\n",
     "        self.output_dim = self.n_patches * self.features_per_patch\n",
-    "        print(f\"[FLAG] {self.n_patches} patches, levels={levels}, dim={self.output_dim} (direct SVD)\")\n",
     "\n",
     "    @torch.amp.custom_fwd(device_type='cuda', cast_inputs=torch.float32)\n",
     "    def forward(self, x):\n",
@@ -1857,8 +1994,8 @@
     "        n_p = patches.shape[2] * patches.shape[3]\n",
     "        # X: (B*n_p, ps*ps, C)\n",
     "        X = patches.permute(0, 2, 3, 1, 4, 5).reshape(B * n_p, C, ps * ps).permute(0, 2, 1)\n",
-    "        # Direct thin SVD\n",
-    "        U, S, Vh = torch.linalg.svd(X, full_matrices=False)\n",
     "        # Features at each flag level\n",
     "        feats = []\n",
     "        for k in self.levels:\n",

    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
+    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub triton\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
+    "# \u2500\u2500 Fused Triton SVD kernel for batched M\u00d73 matrices \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "# cuSOLVER dispatch overhead dominates for tiny (64,3) patches.\n",
+    "# This kernel fuses G=A^T A, 3\u00d73 Jacobi eigensolver (in scalar registers),\n",
+    "# and U recovery into a single kernel launch. ~10,000x faster than cuSOLVER.\n",
+    "_HAS_TRITON_SVD3 = False\n",
+    "try:\n",
+    "    import triton\n",
+    "    import triton.language as tl\n",
+    "\n",
+    "    @triton.jit\n",
+    "    def _svd3_kernel(\n",
+    "        A_ptr, U_ptr, S_ptr, Vh_ptr,\n",
+    "        M: tl.constexpr, BLOCK_M: tl.constexpr,\n",
+    "        JACOBI_ITERS: tl.constexpr, EPS: tl.constexpr,\n",
+    "    ):\n",
+    "        bid = tl.program_id(0)\n",
+    "        # Stage 1: G = A^T A (6 accumulators, symmetric)\n",
+    "        g00 = tl.zeros([], dtype=tl.float32)\n",
+    "        g01 = tl.zeros([], dtype=tl.float32)\n",
+    "        g02 = tl.zeros([], dtype=tl.float32)\n",
+    "        g11 = tl.zeros([], dtype=tl.float32)\n",
+    "        g12 = tl.zeros([], dtype=tl.float32)\n",
+    "        g22 = tl.zeros([], dtype=tl.float32)\n",
+    "        base = bid * M * 3\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            ptr0 = base + row_idx * 3 + 0\n",
+    "            ptr1 = base + row_idx * 3 + 1\n",
+    "            ptr2 = base + row_idx * 3 + 2\n",
+    "            a0 = tl.load(A_ptr + ptr0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr + ptr1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr + ptr2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            g00 += tl.sum(a0 * a0); g01 += tl.sum(a0 * a1); g02 += tl.sum(a0 * a2)\n",
+    "            g11 += tl.sum(a1 * a1); g12 += tl.sum(a1 * a2); g22 += tl.sum(a2 * a2)\n",
+    "        # Stage 2: 3\u00d73 Jacobi eigensolver (all in scalar registers)\n",
+    "        v00 = 1.0; v01 = 0.0; v02 = 0.0\n",
+    "        v10 = 0.0; v11 = 1.0; v12 = 0.0\n",
+    "        v20 = 0.0; v21 = 0.0; v22 = 1.0\n",
+    "        for _sweep in range(JACOBI_ITERS):\n",
+    "            # pair (0,1)\n",
+    "            off_diag = g01; diag_diff = g11 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g01 + s*s*g11; ng11 = s*s*g00 + 2.0*s*c*g01 + c*c*g11\n",
+    "            ng02 = c*g02 - s*g12; ng12 = s*g02 + c*g12\n",
+    "            g00 = ng00; g11 = ng11; g01 = 0.0; g02 = ng02; g12 = ng12\n",
+    "            nv00 = c*v00-s*v01; nv01 = s*v00+c*v01; nv10 = c*v10-s*v11; nv11 = s*v10+c*v11\n",
+    "            nv20 = c*v20-s*v21; nv21 = s*v20+c*v21\n",
+    "            v00=nv00; v01=nv01; v10=nv10; v11=nv11; v20=nv20; v21=nv21\n",
+    "            # pair (0,2)\n",
+    "            off_diag = g02; diag_diff = g22 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g02 + s*s*g22; ng22 = s*s*g00 + 2.0*s*c*g02 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g12; ng12b = s*g01 + c*g12\n",
+    "            g00 = ng00; g22 = ng22; g02 = 0.0; g01 = ng01; g12 = ng12b\n",
+    "            nv00 = c*v00-s*v02; nv02 = s*v00+c*v02; nv10 = c*v10-s*v12; nv12 = s*v10+c*v12\n",
+    "            nv20 = c*v20-s*v22; nv22 = s*v20+c*v22\n",
+    "            v00=nv00; v02=nv02; v10=nv10; v12=nv12; v20=nv20; v22=nv22\n",
+    "            # pair (1,2)\n",
+    "            off_diag = g12; diag_diff = g22 - g11; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng11 = c*c*g11 - 2.0*s*c*g12 + s*s*g22; ng22 = s*s*g11 + 2.0*s*c*g12 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g02; ng02b = s*g01 + c*g02\n",
+    "            g11 = ng11; g22 = ng22; g12 = 0.0; g01 = ng01; g02 = ng02b\n",
+    "            nv01 = c*v01-s*v02; nv02 = s*v01+c*v02; nv11 = c*v11-s*v12; nv12 = s*v11+c*v12\n",
+    "            nv21 = c*v21-s*v22; nv22 = s*v21+c*v22\n",
+    "            v01=nv01; v02=nv02; v11=nv11; v12=nv12; v21=nv21; v22=nv22\n",
+    "        # Sort eigenvalues descending + permute V columns\n",
+    "        s0 = tl.sqrt(tl.maximum(g00, EPS)); s1 = tl.sqrt(tl.maximum(g11, EPS)); s2 = tl.sqrt(tl.maximum(g22, EPS))\n",
+    "        do_swap = s0 < s1\n",
+    "        s0, s1 = tl.where(do_swap, s1, s0), tl.where(do_swap, s0, s1)\n",
+    "        tv=v00; v00=tl.where(do_swap,v01,v00); v01=tl.where(do_swap,tv,v01)\n",
+    "        tv=v10; v10=tl.where(do_swap,v11,v10); v11=tl.where(do_swap,tv,v11)\n",
+    "        tv=v20; v20=tl.where(do_swap,v21,v20); v21=tl.where(do_swap,tv,v21)\n",
+    "        do_swap = s0 < s2\n",
+    "        s0, s2 = tl.where(do_swap, s2, s0), tl.where(do_swap, s0, s2)\n",
+    "        tv=v00; v00=tl.where(do_swap,v02,v00); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v10; v10=tl.where(do_swap,v12,v10); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v20; v20=tl.where(do_swap,v22,v20); v22=tl.where(do_swap,tv,v22)\n",
+    "        do_swap = s1 < s2\n",
+    "        s1, s2 = tl.where(do_swap, s2, s1), tl.where(do_swap, s1, s2)\n",
+    "        tv=v01; v01=tl.where(do_swap,v02,v01); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v11; v11=tl.where(do_swap,v12,v11); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v21; v21=tl.where(do_swap,v22,v21); v22=tl.where(do_swap,tv,v22)\n",
+    "        # Write S\n",
+    "        s_base = bid * 3\n",
+    "        tl.store(S_ptr + s_base + 0, s0); tl.store(S_ptr + s_base + 1, s1); tl.store(S_ptr + s_base + 2, s2)\n",
+    "        # Write Vh = V^T\n",
+    "        vh_base = bid * 9\n",
+    "        tl.store(Vh_ptr+vh_base+0,v00); tl.store(Vh_ptr+vh_base+1,v10); tl.store(Vh_ptr+vh_base+2,v20)\n",
+    "        tl.store(Vh_ptr+vh_base+3,v01); tl.store(Vh_ptr+vh_base+4,v11); tl.store(Vh_ptr+vh_base+5,v21)\n",
+    "        tl.store(Vh_ptr+vh_base+6,v02); tl.store(Vh_ptr+vh_base+7,v12); tl.store(Vh_ptr+vh_base+8,v22)\n",
+    "        # Stage 3: U = A @ V @ diag(1/S)\n",
+    "        inv_s0 = 1.0/(s0+EPS); inv_s1 = 1.0/(s1+EPS); inv_s2 = 1.0/(s2+EPS)\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            a0 = tl.load(A_ptr+base+row_idx*3+0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr+base+row_idx*3+1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr+base+row_idx*3+2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            u0 = (a0*v00 + a1*v10 + a2*v20) * inv_s0\n",
+    "            u1 = (a0*v01 + a1*v11 + a2*v21) * inv_s1\n",
+    "            u2 = (a0*v02 + a1*v12 + a2*v22) * inv_s2\n",
+    "            u_base = bid * M * 3\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+0, u0, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+1, u1, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+2, u2, mask=mask)\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fused Triton SVD for (B, M, 3) tensors. Returns U, S, Vh.\"\"\"\n",
+    "        assert A.ndim == 3 and A.shape[2] == 3\n",
+    "        B, M, _ = A.shape\n",
+    "        A_f32 = A.contiguous().float()\n",
+    "        U = torch.empty((B, M, 3), dtype=torch.float32, device=A.device)\n",
+    "        S = torch.empty((B, 3), dtype=torch.float32, device=A.device)\n",
+    "        Vh = torch.empty((B, 3, 3), dtype=torch.float32, device=A.device)\n",
+    "        _svd3_kernel[(B,)](A_f32, U, S, Vh, M=M, BLOCK_M=block_m, JACOBI_ITERS=jacobi_iters, EPS=1e-12)\n",
+    "        return U, S, Vh\n",
+    "\n",
+    "    _HAS_TRITON_SVD3 = True\n",
+    "    print(\"[PERF] Triton SVD3 kernel loaded \u2014 fused 3\u00d73 Jacobi eigensolver\")\n",
+    "except ImportError:\n",
+    "    print(\"[PERF] Triton not available \u2014 falling back to torch.linalg.svd\")\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fallback: torch.linalg.svd for (B, M, 3) tensors.\"\"\"\n",
+    "        return torch.linalg.svd(A.float(), full_matrices=False)\n",
+    "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
    "source": [
     "# @title Experiment 2.2 \u2014 Grassmannian Subspace Features\n",
     "class GrassmannianFrontEnd(nn.Module):\n",
+    "    \"\"\"Grassmannian subspace features via SVD.\n",
+    "    Uses fused Triton SVD3 kernel (3\u00d73 Jacobi in registers) when available,\n",
+    "    falls back to torch.linalg.svd otherwise. Same mathematical decomposition.\n",
+    "    Features: singular values, log ratios, right singular vectors V.\"\"\"\n",
     "    def __init__(self, patch_size=8, k=3, input_size=32):\n",
     "        super().__init__()\n",
     "        self.patch_size = patch_size\n",
     "        # k singular values + k log-ratios + k*C right singular vector entries\n",
     "        self.features_per_patch = k + k + k * self.C\n",
     "        self.output_dim = self.n_patches * self.features_per_patch\n",
+    "        backend = \"Triton SVD3\" if _HAS_TRITON_SVD3 else \"torch.linalg.svd\"\n",
+    "        print(f\"[GRASS] {self.n_patches} patches, k={k}, dim={self.output_dim} ({backend})\")\n",
     "\n",
     "    @torch.amp.custom_fwd(device_type='cuda', cast_inputs=torch.float32)\n",
     "    def forward(self, x):\n",
     "        n_p = patches.shape[2] * patches.shape[3]\n",
     "        # X: (B*n_p, ps*ps, C) \u2014 each patch as a tall-skinny matrix\n",
     "        X = patches.permute(0, 2, 3, 1, 4, 5).reshape(B * n_p, C, ps * ps).permute(0, 2, 1)\n",
+    "        # SVD via fused Triton kernel (or torch fallback)\n",
+    "        U, S, Vh = batched_svd3(X)\n",
     "        S = S[:, :self.k]\n",
     "        # Log singular value ratios (scale-invariant spectrum)\n",
     "        sv_ratios = torch.log(S / (S[:, -1:] + 1e-8) + 1e-8)\n",
    "source": [
     "# @title Experiment 2.3 \u2014 Flag Manifold\n",
     "class FlagManifoldFrontEnd(nn.Module):\n",
+    "    \"\"\"Cascading SVD at multiple truncation levels.\n",
+    "    Uses fused Triton SVD3 kernel when available.\n",
     "    Nested subspace features: singular values + projection norms at each flag level.\n",
     "    The flag structure captures how information distributes across\n",
     "    nested subspace hierarchies \u2014 a genuine flag manifold signature.\"\"\"\n",
     "        max_sv = min(3, patch_size * patch_size)\n",
     "        self.features_per_patch = sum(min(k, max_sv) * 2 for k in levels)\n",
     "        self.output_dim = self.n_patches * self.features_per_patch\n",
+    "        backend = \"Triton SVD3\" if _HAS_TRITON_SVD3 else \"torch.linalg.svd\"\n",
+    "        print(f\"[FLAG] {self.n_patches} patches, levels={levels}, dim={self.output_dim} ({backend})\")\n",
     "\n",
     "    @torch.amp.custom_fwd(device_type='cuda', cast_inputs=torch.float32)\n",
     "    def forward(self, x):\n",
     "        n_p = patches.shape[2] * patches.shape[3]\n",
     "        # X: (B*n_p, ps*ps, C)\n",
     "        X = patches.permute(0, 2, 3, 1, 4, 5).reshape(B * n_p, C, ps * ps).permute(0, 2, 1)\n",
+    "        # SVD via fused Triton kernel (or torch fallback)\n",
+    "        U, S, Vh = batched_svd3(X)\n",
     "        # Features at each flag level\n",
     "        feats = []\n",
     "        for k in self.levels:\n",

spectral/notebooks/experiment_3_compact_representations.ipynb CHANGED Viewed

@@ -38,7 +38,7 @@
    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
-    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
@@ -91,6 +91,142 @@
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
+    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub triton\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
+    "# \u2500\u2500 Fused Triton SVD kernel for batched M\u00d73 matrices \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "# cuSOLVER dispatch overhead dominates for tiny (64,3) patches.\n",
+    "# This kernel fuses G=A^T A, 3\u00d73 Jacobi eigensolver (in scalar registers),\n",
+    "# and U recovery into a single kernel launch. ~10,000x faster than cuSOLVER.\n",
+    "_HAS_TRITON_SVD3 = False\n",
+    "try:\n",
+    "    import triton\n",
+    "    import triton.language as tl\n",
+    "\n",
+    "    @triton.jit\n",
+    "    def _svd3_kernel(\n",
+    "        A_ptr, U_ptr, S_ptr, Vh_ptr,\n",
+    "        M: tl.constexpr, BLOCK_M: tl.constexpr,\n",
+    "        JACOBI_ITERS: tl.constexpr, EPS: tl.constexpr,\n",
+    "    ):\n",
+    "        bid = tl.program_id(0)\n",
+    "        # Stage 1: G = A^T A (6 accumulators, symmetric)\n",
+    "        g00 = tl.zeros([], dtype=tl.float32)\n",
+    "        g01 = tl.zeros([], dtype=tl.float32)\n",
+    "        g02 = tl.zeros([], dtype=tl.float32)\n",
+    "        g11 = tl.zeros([], dtype=tl.float32)\n",
+    "        g12 = tl.zeros([], dtype=tl.float32)\n",
+    "        g22 = tl.zeros([], dtype=tl.float32)\n",
+    "        base = bid * M * 3\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            ptr0 = base + row_idx * 3 + 0\n",
+    "            ptr1 = base + row_idx * 3 + 1\n",
+    "            ptr2 = base + row_idx * 3 + 2\n",
+    "            a0 = tl.load(A_ptr + ptr0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr + ptr1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr + ptr2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            g00 += tl.sum(a0 * a0); g01 += tl.sum(a0 * a1); g02 += tl.sum(a0 * a2)\n",
+    "            g11 += tl.sum(a1 * a1); g12 += tl.sum(a1 * a2); g22 += tl.sum(a2 * a2)\n",
+    "        # Stage 2: 3\u00d73 Jacobi eigensolver (all in scalar registers)\n",
+    "        v00 = 1.0; v01 = 0.0; v02 = 0.0\n",
+    "        v10 = 0.0; v11 = 1.0; v12 = 0.0\n",
+    "        v20 = 0.0; v21 = 0.0; v22 = 1.0\n",
+    "        for _sweep in range(JACOBI_ITERS):\n",
+    "            # pair (0,1)\n",
+    "            off_diag = g01; diag_diff = g11 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g01 + s*s*g11; ng11 = s*s*g00 + 2.0*s*c*g01 + c*c*g11\n",
+    "            ng02 = c*g02 - s*g12; ng12 = s*g02 + c*g12\n",
+    "            g00 = ng00; g11 = ng11; g01 = 0.0; g02 = ng02; g12 = ng12\n",
+    "            nv00 = c*v00-s*v01; nv01 = s*v00+c*v01; nv10 = c*v10-s*v11; nv11 = s*v10+c*v11\n",
+    "            nv20 = c*v20-s*v21; nv21 = s*v20+c*v21\n",
+    "            v00=nv00; v01=nv01; v10=nv10; v11=nv11; v20=nv20; v21=nv21\n",
+    "            # pair (0,2)\n",
+    "            off_diag = g02; diag_diff = g22 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g02 + s*s*g22; ng22 = s*s*g00 + 2.0*s*c*g02 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g12; ng12b = s*g01 + c*g12\n",
+    "            g00 = ng00; g22 = ng22; g02 = 0.0; g01 = ng01; g12 = ng12b\n",
+    "            nv00 = c*v00-s*v02; nv02 = s*v00+c*v02; nv10 = c*v10-s*v12; nv12 = s*v10+c*v12\n",
+    "            nv20 = c*v20-s*v22; nv22 = s*v20+c*v22\n",
+    "            v00=nv00; v02=nv02; v10=nv10; v12=nv12; v20=nv20; v22=nv22\n",
+    "            # pair (1,2)\n",
+    "            off_diag = g12; diag_diff = g22 - g11; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng11 = c*c*g11 - 2.0*s*c*g12 + s*s*g22; ng22 = s*s*g11 + 2.0*s*c*g12 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g02; ng02b = s*g01 + c*g02\n",
+    "            g11 = ng11; g22 = ng22; g12 = 0.0; g01 = ng01; g02 = ng02b\n",
+    "            nv01 = c*v01-s*v02; nv02 = s*v01+c*v02; nv11 = c*v11-s*v12; nv12 = s*v11+c*v12\n",
+    "            nv21 = c*v21-s*v22; nv22 = s*v21+c*v22\n",
+    "            v01=nv01; v02=nv02; v11=nv11; v12=nv12; v21=nv21; v22=nv22\n",
+    "        # Sort eigenvalues descending + permute V columns\n",
+    "        s0 = tl.sqrt(tl.maximum(g00, EPS)); s1 = tl.sqrt(tl.maximum(g11, EPS)); s2 = tl.sqrt(tl.maximum(g22, EPS))\n",
+    "        do_swap = s0 < s1\n",
+    "        s0, s1 = tl.where(do_swap, s1, s0), tl.where(do_swap, s0, s1)\n",
+    "        tv=v00; v00=tl.where(do_swap,v01,v00); v01=tl.where(do_swap,tv,v01)\n",
+    "        tv=v10; v10=tl.where(do_swap,v11,v10); v11=tl.where(do_swap,tv,v11)\n",
+    "        tv=v20; v20=tl.where(do_swap,v21,v20); v21=tl.where(do_swap,tv,v21)\n",
+    "        do_swap = s0 < s2\n",
+    "        s0, s2 = tl.where(do_swap, s2, s0), tl.where(do_swap, s0, s2)\n",
+    "        tv=v00; v00=tl.where(do_swap,v02,v00); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v10; v10=tl.where(do_swap,v12,v10); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v20; v20=tl.where(do_swap,v22,v20); v22=tl.where(do_swap,tv,v22)\n",
+    "        do_swap = s1 < s2\n",
+    "        s1, s2 = tl.where(do_swap, s2, s1), tl.where(do_swap, s1, s2)\n",
+    "        tv=v01; v01=tl.where(do_swap,v02,v01); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v11; v11=tl.where(do_swap,v12,v11); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v21; v21=tl.where(do_swap,v22,v21); v22=tl.where(do_swap,tv,v22)\n",
+    "        # Write S\n",
+    "        s_base = bid * 3\n",
+    "        tl.store(S_ptr + s_base + 0, s0); tl.store(S_ptr + s_base + 1, s1); tl.store(S_ptr + s_base + 2, s2)\n",
+    "        # Write Vh = V^T\n",
+    "        vh_base = bid * 9\n",
+    "        tl.store(Vh_ptr+vh_base+0,v00); tl.store(Vh_ptr+vh_base+1,v10); tl.store(Vh_ptr+vh_base+2,v20)\n",
+    "        tl.store(Vh_ptr+vh_base+3,v01); tl.store(Vh_ptr+vh_base+4,v11); tl.store(Vh_ptr+vh_base+5,v21)\n",
+    "        tl.store(Vh_ptr+vh_base+6,v02); tl.store(Vh_ptr+vh_base+7,v12); tl.store(Vh_ptr+vh_base+8,v22)\n",
+    "        # Stage 3: U = A @ V @ diag(1/S)\n",
+    "        inv_s0 = 1.0/(s0+EPS); inv_s1 = 1.0/(s1+EPS); inv_s2 = 1.0/(s2+EPS)\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            a0 = tl.load(A_ptr+base+row_idx*3+0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr+base+row_idx*3+1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr+base+row_idx*3+2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            u0 = (a0*v00 + a1*v10 + a2*v20) * inv_s0\n",
+    "            u1 = (a0*v01 + a1*v11 + a2*v21) * inv_s1\n",
+    "            u2 = (a0*v02 + a1*v12 + a2*v22) * inv_s2\n",
+    "            u_base = bid * M * 3\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+0, u0, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+1, u1, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+2, u2, mask=mask)\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fused Triton SVD for (B, M, 3) tensors. Returns U, S, Vh.\"\"\"\n",
+    "        assert A.ndim == 3 and A.shape[2] == 3\n",
+    "        B, M, _ = A.shape\n",
+    "        A_f32 = A.contiguous().float()\n",
+    "        U = torch.empty((B, M, 3), dtype=torch.float32, device=A.device)\n",
+    "        S = torch.empty((B, 3), dtype=torch.float32, device=A.device)\n",
+    "        Vh = torch.empty((B, 3, 3), dtype=torch.float32, device=A.device)\n",
+    "        _svd3_kernel[(B,)](A_f32, U, S, Vh, M=M, BLOCK_M=block_m, JACOBI_ITERS=jacobi_iters, EPS=1e-12)\n",
+    "        return U, S, Vh\n",
+    "\n",
+    "    _HAS_TRITON_SVD3 = True\n",
+    "    print(\"[PERF] Triton SVD3 kernel loaded \u2014 fused 3\u00d73 Jacobi eigensolver\")\n",
+    "except ImportError:\n",
+    "    print(\"[PERF] Triton not available \u2014 falling back to torch.linalg.svd\")\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fallback: torch.linalg.svd for (B, M, 3) tensors.\"\"\"\n",
+    "        return torch.linalg.svd(A.float(), full_matrices=False)\n",
+    "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

spectral/notebooks/experiment_4_invertible_transforms.ipynb CHANGED Viewed

@@ -39,7 +39,7 @@
    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
-    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
@@ -92,6 +92,142 @@
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
+    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub triton\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
+    "# \u2500\u2500 Fused Triton SVD kernel for batched M\u00d73 matrices \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "# cuSOLVER dispatch overhead dominates for tiny (64,3) patches.\n",
+    "# This kernel fuses G=A^T A, 3\u00d73 Jacobi eigensolver (in scalar registers),\n",
+    "# and U recovery into a single kernel launch. ~10,000x faster than cuSOLVER.\n",
+    "_HAS_TRITON_SVD3 = False\n",
+    "try:\n",
+    "    import triton\n",
+    "    import triton.language as tl\n",
+    "\n",
+    "    @triton.jit\n",
+    "    def _svd3_kernel(\n",
+    "        A_ptr, U_ptr, S_ptr, Vh_ptr,\n",
+    "        M: tl.constexpr, BLOCK_M: tl.constexpr,\n",
+    "        JACOBI_ITERS: tl.constexpr, EPS: tl.constexpr,\n",
+    "    ):\n",
+    "        bid = tl.program_id(0)\n",
+    "        # Stage 1: G = A^T A (6 accumulators, symmetric)\n",
+    "        g00 = tl.zeros([], dtype=tl.float32)\n",
+    "        g01 = tl.zeros([], dtype=tl.float32)\n",
+    "        g02 = tl.zeros([], dtype=tl.float32)\n",
+    "        g11 = tl.zeros([], dtype=tl.float32)\n",
+    "        g12 = tl.zeros([], dtype=tl.float32)\n",
+    "        g22 = tl.zeros([], dtype=tl.float32)\n",
+    "        base = bid * M * 3\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            ptr0 = base + row_idx * 3 + 0\n",
+    "            ptr1 = base + row_idx * 3 + 1\n",
+    "            ptr2 = base + row_idx * 3 + 2\n",
+    "            a0 = tl.load(A_ptr + ptr0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr + ptr1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr + ptr2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            g00 += tl.sum(a0 * a0); g01 += tl.sum(a0 * a1); g02 += tl.sum(a0 * a2)\n",
+    "            g11 += tl.sum(a1 * a1); g12 += tl.sum(a1 * a2); g22 += tl.sum(a2 * a2)\n",
+    "        # Stage 2: 3\u00d73 Jacobi eigensolver (all in scalar registers)\n",
+    "        v00 = 1.0; v01 = 0.0; v02 = 0.0\n",
+    "        v10 = 0.0; v11 = 1.0; v12 = 0.0\n",
+    "        v20 = 0.0; v21 = 0.0; v22 = 1.0\n",
+    "        for _sweep in range(JACOBI_ITERS):\n",
+    "            # pair (0,1)\n",
+    "            off_diag = g01; diag_diff = g11 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g01 + s*s*g11; ng11 = s*s*g00 + 2.0*s*c*g01 + c*c*g11\n",
+    "            ng02 = c*g02 - s*g12; ng12 = s*g02 + c*g12\n",
+    "            g00 = ng00; g11 = ng11; g01 = 0.0; g02 = ng02; g12 = ng12\n",
+    "            nv00 = c*v00-s*v01; nv01 = s*v00+c*v01; nv10 = c*v10-s*v11; nv11 = s*v10+c*v11\n",
+    "            nv20 = c*v20-s*v21; nv21 = s*v20+c*v21\n",
+    "            v00=nv00; v01=nv01; v10=nv10; v11=nv11; v20=nv20; v21=nv21\n",
+    "            # pair (0,2)\n",
+    "            off_diag = g02; diag_diff = g22 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g02 + s*s*g22; ng22 = s*s*g00 + 2.0*s*c*g02 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g12; ng12b = s*g01 + c*g12\n",
+    "            g00 = ng00; g22 = ng22; g02 = 0.0; g01 = ng01; g12 = ng12b\n",
+    "            nv00 = c*v00-s*v02; nv02 = s*v00+c*v02; nv10 = c*v10-s*v12; nv12 = s*v10+c*v12\n",
+    "            nv20 = c*v20-s*v22; nv22 = s*v20+c*v22\n",
+    "            v00=nv00; v02=nv02; v10=nv10; v12=nv12; v20=nv20; v22=nv22\n",
+    "            # pair (1,2)\n",
+    "            off_diag = g12; diag_diff = g22 - g11; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng11 = c*c*g11 - 2.0*s*c*g12 + s*s*g22; ng22 = s*s*g11 + 2.0*s*c*g12 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g02; ng02b = s*g01 + c*g02\n",
+    "            g11 = ng11; g22 = ng22; g12 = 0.0; g01 = ng01; g02 = ng02b\n",
+    "            nv01 = c*v01-s*v02; nv02 = s*v01+c*v02; nv11 = c*v11-s*v12; nv12 = s*v11+c*v12\n",
+    "            nv21 = c*v21-s*v22; nv22 = s*v21+c*v22\n",
+    "            v01=nv01; v02=nv02; v11=nv11; v12=nv12; v21=nv21; v22=nv22\n",
+    "        # Sort eigenvalues descending + permute V columns\n",
+    "        s0 = tl.sqrt(tl.maximum(g00, EPS)); s1 = tl.sqrt(tl.maximum(g11, EPS)); s2 = tl.sqrt(tl.maximum(g22, EPS))\n",
+    "        do_swap = s0 < s1\n",
+    "        s0, s1 = tl.where(do_swap, s1, s0), tl.where(do_swap, s0, s1)\n",
+    "        tv=v00; v00=tl.where(do_swap,v01,v00); v01=tl.where(do_swap,tv,v01)\n",
+    "        tv=v10; v10=tl.where(do_swap,v11,v10); v11=tl.where(do_swap,tv,v11)\n",
+    "        tv=v20; v20=tl.where(do_swap,v21,v20); v21=tl.where(do_swap,tv,v21)\n",
+    "        do_swap = s0 < s2\n",
+    "        s0, s2 = tl.where(do_swap, s2, s0), tl.where(do_swap, s0, s2)\n",
+    "        tv=v00; v00=tl.where(do_swap,v02,v00); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v10; v10=tl.where(do_swap,v12,v10); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v20; v20=tl.where(do_swap,v22,v20); v22=tl.where(do_swap,tv,v22)\n",
+    "        do_swap = s1 < s2\n",
+    "        s1, s2 = tl.where(do_swap, s2, s1), tl.where(do_swap, s1, s2)\n",
+    "        tv=v01; v01=tl.where(do_swap,v02,v01); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v11; v11=tl.where(do_swap,v12,v11); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v21; v21=tl.where(do_swap,v22,v21); v22=tl.where(do_swap,tv,v22)\n",
+    "        # Write S\n",
+    "        s_base = bid * 3\n",
+    "        tl.store(S_ptr + s_base + 0, s0); tl.store(S_ptr + s_base + 1, s1); tl.store(S_ptr + s_base + 2, s2)\n",
+    "        # Write Vh = V^T\n",
+    "        vh_base = bid * 9\n",
+    "        tl.store(Vh_ptr+vh_base+0,v00); tl.store(Vh_ptr+vh_base+1,v10); tl.store(Vh_ptr+vh_base+2,v20)\n",
+    "        tl.store(Vh_ptr+vh_base+3,v01); tl.store(Vh_ptr+vh_base+4,v11); tl.store(Vh_ptr+vh_base+5,v21)\n",
+    "        tl.store(Vh_ptr+vh_base+6,v02); tl.store(Vh_ptr+vh_base+7,v12); tl.store(Vh_ptr+vh_base+8,v22)\n",
+    "        # Stage 3: U = A @ V @ diag(1/S)\n",
+    "        inv_s0 = 1.0/(s0+EPS); inv_s1 = 1.0/(s1+EPS); inv_s2 = 1.0/(s2+EPS)\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            a0 = tl.load(A_ptr+base+row_idx*3+0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr+base+row_idx*3+1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr+base+row_idx*3+2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            u0 = (a0*v00 + a1*v10 + a2*v20) * inv_s0\n",
+    "            u1 = (a0*v01 + a1*v11 + a2*v21) * inv_s1\n",
+    "            u2 = (a0*v02 + a1*v12 + a2*v22) * inv_s2\n",
+    "            u_base = bid * M * 3\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+0, u0, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+1, u1, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+2, u2, mask=mask)\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fused Triton SVD for (B, M, 3) tensors. Returns U, S, Vh.\"\"\"\n",
+    "        assert A.ndim == 3 and A.shape[2] == 3\n",
+    "        B, M, _ = A.shape\n",
+    "        A_f32 = A.contiguous().float()\n",
+    "        U = torch.empty((B, M, 3), dtype=torch.float32, device=A.device)\n",
+    "        S = torch.empty((B, 3), dtype=torch.float32, device=A.device)\n",
+    "        Vh = torch.empty((B, 3, 3), dtype=torch.float32, device=A.device)\n",
+    "        _svd3_kernel[(B,)](A_f32, U, S, Vh, M=M, BLOCK_M=block_m, JACOBI_ITERS=jacobi_iters, EPS=1e-12)\n",
+    "        return U, S, Vh\n",
+    "\n",
+    "    _HAS_TRITON_SVD3 = True\n",
+    "    print(\"[PERF] Triton SVD3 kernel loaded \u2014 fused 3\u00d73 Jacobi eigensolver\")\n",
+    "except ImportError:\n",
+    "    print(\"[PERF] Triton not available \u2014 falling back to torch.linalg.svd\")\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fallback: torch.linalg.svd for (B, M, 3) tensors.\"\"\"\n",
+    "        return torch.linalg.svd(A.float(), full_matrices=False)\n",
+    "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

spectral/notebooks/experiment_5_matrix_decompositions.ipynb CHANGED Viewed

@@ -39,7 +39,7 @@
    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
-    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
@@ -92,6 +92,142 @@
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
+    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub triton\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
+    "# \u2500\u2500 Fused Triton SVD kernel for batched M\u00d73 matrices \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "# cuSOLVER dispatch overhead dominates for tiny (64,3) patches.\n",
+    "# This kernel fuses G=A^T A, 3\u00d73 Jacobi eigensolver (in scalar registers),\n",
+    "# and U recovery into a single kernel launch. ~10,000x faster than cuSOLVER.\n",
+    "_HAS_TRITON_SVD3 = False\n",
+    "try:\n",
+    "    import triton\n",
+    "    import triton.language as tl\n",
+    "\n",
+    "    @triton.jit\n",
+    "    def _svd3_kernel(\n",
+    "        A_ptr, U_ptr, S_ptr, Vh_ptr,\n",
+    "        M: tl.constexpr, BLOCK_M: tl.constexpr,\n",
+    "        JACOBI_ITERS: tl.constexpr, EPS: tl.constexpr,\n",
+    "    ):\n",
+    "        bid = tl.program_id(0)\n",
+    "        # Stage 1: G = A^T A (6 accumulators, symmetric)\n",
+    "        g00 = tl.zeros([], dtype=tl.float32)\n",
+    "        g01 = tl.zeros([], dtype=tl.float32)\n",
+    "        g02 = tl.zeros([], dtype=tl.float32)\n",
+    "        g11 = tl.zeros([], dtype=tl.float32)\n",
+    "        g12 = tl.zeros([], dtype=tl.float32)\n",
+    "        g22 = tl.zeros([], dtype=tl.float32)\n",
+    "        base = bid * M * 3\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            ptr0 = base + row_idx * 3 + 0\n",
+    "            ptr1 = base + row_idx * 3 + 1\n",
+    "            ptr2 = base + row_idx * 3 + 2\n",
+    "            a0 = tl.load(A_ptr + ptr0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr + ptr1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr + ptr2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            g00 += tl.sum(a0 * a0); g01 += tl.sum(a0 * a1); g02 += tl.sum(a0 * a2)\n",
+    "            g11 += tl.sum(a1 * a1); g12 += tl.sum(a1 * a2); g22 += tl.sum(a2 * a2)\n",
+    "        # Stage 2: 3\u00d73 Jacobi eigensolver (all in scalar registers)\n",
+    "        v00 = 1.0; v01 = 0.0; v02 = 0.0\n",
+    "        v10 = 0.0; v11 = 1.0; v12 = 0.0\n",
+    "        v20 = 0.0; v21 = 0.0; v22 = 1.0\n",
+    "        for _sweep in range(JACOBI_ITERS):\n",
+    "            # pair (0,1)\n",
+    "            off_diag = g01; diag_diff = g11 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g01 + s*s*g11; ng11 = s*s*g00 + 2.0*s*c*g01 + c*c*g11\n",
+    "            ng02 = c*g02 - s*g12; ng12 = s*g02 + c*g12\n",
+    "            g00 = ng00; g11 = ng11; g01 = 0.0; g02 = ng02; g12 = ng12\n",
+    "            nv00 = c*v00-s*v01; nv01 = s*v00+c*v01; nv10 = c*v10-s*v11; nv11 = s*v10+c*v11\n",
+    "            nv20 = c*v20-s*v21; nv21 = s*v20+c*v21\n",
+    "            v00=nv00; v01=nv01; v10=nv10; v11=nv11; v20=nv20; v21=nv21\n",
+    "            # pair (0,2)\n",
+    "            off_diag = g02; diag_diff = g22 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g02 + s*s*g22; ng22 = s*s*g00 + 2.0*s*c*g02 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g12; ng12b = s*g01 + c*g12\n",
+    "            g00 = ng00; g22 = ng22; g02 = 0.0; g01 = ng01; g12 = ng12b\n",
+    "            nv00 = c*v00-s*v02; nv02 = s*v00+c*v02; nv10 = c*v10-s*v12; nv12 = s*v10+c*v12\n",
+    "            nv20 = c*v20-s*v22; nv22 = s*v20+c*v22\n",
+    "            v00=nv00; v02=nv02; v10=nv10; v12=nv12; v20=nv20; v22=nv22\n",
+    "            # pair (1,2)\n",
+    "            off_diag = g12; diag_diff = g22 - g11; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng11 = c*c*g11 - 2.0*s*c*g12 + s*s*g22; ng22 = s*s*g11 + 2.0*s*c*g12 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g02; ng02b = s*g01 + c*g02\n",
+    "            g11 = ng11; g22 = ng22; g12 = 0.0; g01 = ng01; g02 = ng02b\n",
+    "            nv01 = c*v01-s*v02; nv02 = s*v01+c*v02; nv11 = c*v11-s*v12; nv12 = s*v11+c*v12\n",
+    "            nv21 = c*v21-s*v22; nv22 = s*v21+c*v22\n",
+    "            v01=nv01; v02=nv02; v11=nv11; v12=nv12; v21=nv21; v22=nv22\n",
+    "        # Sort eigenvalues descending + permute V columns\n",
+    "        s0 = tl.sqrt(tl.maximum(g00, EPS)); s1 = tl.sqrt(tl.maximum(g11, EPS)); s2 = tl.sqrt(tl.maximum(g22, EPS))\n",
+    "        do_swap = s0 < s1\n",
+    "        s0, s1 = tl.where(do_swap, s1, s0), tl.where(do_swap, s0, s1)\n",
+    "        tv=v00; v00=tl.where(do_swap,v01,v00); v01=tl.where(do_swap,tv,v01)\n",
+    "        tv=v10; v10=tl.where(do_swap,v11,v10); v11=tl.where(do_swap,tv,v11)\n",
+    "        tv=v20; v20=tl.where(do_swap,v21,v20); v21=tl.where(do_swap,tv,v21)\n",
+    "        do_swap = s0 < s2\n",
+    "        s0, s2 = tl.where(do_swap, s2, s0), tl.where(do_swap, s0, s2)\n",
+    "        tv=v00; v00=tl.where(do_swap,v02,v00); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v10; v10=tl.where(do_swap,v12,v10); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v20; v20=tl.where(do_swap,v22,v20); v22=tl.where(do_swap,tv,v22)\n",
+    "        do_swap = s1 < s2\n",
+    "        s1, s2 = tl.where(do_swap, s2, s1), tl.where(do_swap, s1, s2)\n",
+    "        tv=v01; v01=tl.where(do_swap,v02,v01); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v11; v11=tl.where(do_swap,v12,v11); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v21; v21=tl.where(do_swap,v22,v21); v22=tl.where(do_swap,tv,v22)\n",
+    "        # Write S\n",
+    "        s_base = bid * 3\n",
+    "        tl.store(S_ptr + s_base + 0, s0); tl.store(S_ptr + s_base + 1, s1); tl.store(S_ptr + s_base + 2, s2)\n",
+    "        # Write Vh = V^T\n",
+    "        vh_base = bid * 9\n",
+    "        tl.store(Vh_ptr+vh_base+0,v00); tl.store(Vh_ptr+vh_base+1,v10); tl.store(Vh_ptr+vh_base+2,v20)\n",
+    "        tl.store(Vh_ptr+vh_base+3,v01); tl.store(Vh_ptr+vh_base+4,v11); tl.store(Vh_ptr+vh_base+5,v21)\n",
+    "        tl.store(Vh_ptr+vh_base+6,v02); tl.store(Vh_ptr+vh_base+7,v12); tl.store(Vh_ptr+vh_base+8,v22)\n",
+    "        # Stage 3: U = A @ V @ diag(1/S)\n",
+    "        inv_s0 = 1.0/(s0+EPS); inv_s1 = 1.0/(s1+EPS); inv_s2 = 1.0/(s2+EPS)\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            a0 = tl.load(A_ptr+base+row_idx*3+0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr+base+row_idx*3+1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr+base+row_idx*3+2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            u0 = (a0*v00 + a1*v10 + a2*v20) * inv_s0\n",
+    "            u1 = (a0*v01 + a1*v11 + a2*v21) * inv_s1\n",
+    "            u2 = (a0*v02 + a1*v12 + a2*v22) * inv_s2\n",
+    "            u_base = bid * M * 3\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+0, u0, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+1, u1, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+2, u2, mask=mask)\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fused Triton SVD for (B, M, 3) tensors. Returns U, S, Vh.\"\"\"\n",
+    "        assert A.ndim == 3 and A.shape[2] == 3\n",
+    "        B, M, _ = A.shape\n",
+    "        A_f32 = A.contiguous().float()\n",
+    "        U = torch.empty((B, M, 3), dtype=torch.float32, device=A.device)\n",
+    "        S = torch.empty((B, 3), dtype=torch.float32, device=A.device)\n",
+    "        Vh = torch.empty((B, 3, 3), dtype=torch.float32, device=A.device)\n",
+    "        _svd3_kernel[(B,)](A_f32, U, S, Vh, M=M, BLOCK_M=block_m, JACOBI_ITERS=jacobi_iters, EPS=1e-12)\n",
+    "        return U, S, Vh\n",
+    "\n",
+    "    _HAS_TRITON_SVD3 = True\n",
+    "    print(\"[PERF] Triton SVD3 kernel loaded \u2014 fused 3\u00d73 Jacobi eigensolver\")\n",
+    "except ImportError:\n",
+    "    print(\"[PERF] Triton not available \u2014 falling back to torch.linalg.svd\")\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fallback: torch.linalg.svd for (B, M, 3) tensors.\"\"\"\n",
+    "        return torch.linalg.svd(A.float(), full_matrices=False)\n",
+    "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

spectral/notebooks/experiment_6_losses_and_anchors.ipynb CHANGED Viewed

@@ -41,7 +41,7 @@
    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
-    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
@@ -94,6 +94,142 @@
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
+    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub triton\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
+    "# \u2500\u2500 Fused Triton SVD kernel for batched M\u00d73 matrices \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "# cuSOLVER dispatch overhead dominates for tiny (64,3) patches.\n",
+    "# This kernel fuses G=A^T A, 3\u00d73 Jacobi eigensolver (in scalar registers),\n",
+    "# and U recovery into a single kernel launch. ~10,000x faster than cuSOLVER.\n",
+    "_HAS_TRITON_SVD3 = False\n",
+    "try:\n",
+    "    import triton\n",
+    "    import triton.language as tl\n",
+    "\n",
+    "    @triton.jit\n",
+    "    def _svd3_kernel(\n",
+    "        A_ptr, U_ptr, S_ptr, Vh_ptr,\n",
+    "        M: tl.constexpr, BLOCK_M: tl.constexpr,\n",
+    "        JACOBI_ITERS: tl.constexpr, EPS: tl.constexpr,\n",
+    "    ):\n",
+    "        bid = tl.program_id(0)\n",
+    "        # Stage 1: G = A^T A (6 accumulators, symmetric)\n",
+    "        g00 = tl.zeros([], dtype=tl.float32)\n",
+    "        g01 = tl.zeros([], dtype=tl.float32)\n",
+    "        g02 = tl.zeros([], dtype=tl.float32)\n",
+    "        g11 = tl.zeros([], dtype=tl.float32)\n",
+    "        g12 = tl.zeros([], dtype=tl.float32)\n",
+    "        g22 = tl.zeros([], dtype=tl.float32)\n",
+    "        base = bid * M * 3\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            ptr0 = base + row_idx * 3 + 0\n",
+    "            ptr1 = base + row_idx * 3 + 1\n",
+    "            ptr2 = base + row_idx * 3 + 2\n",
+    "            a0 = tl.load(A_ptr + ptr0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr + ptr1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr + ptr2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            g00 += tl.sum(a0 * a0); g01 += tl.sum(a0 * a1); g02 += tl.sum(a0 * a2)\n",
+    "            g11 += tl.sum(a1 * a1); g12 += tl.sum(a1 * a2); g22 += tl.sum(a2 * a2)\n",
+    "        # Stage 2: 3\u00d73 Jacobi eigensolver (all in scalar registers)\n",
+    "        v00 = 1.0; v01 = 0.0; v02 = 0.0\n",
+    "        v10 = 0.0; v11 = 1.0; v12 = 0.0\n",
+    "        v20 = 0.0; v21 = 0.0; v22 = 1.0\n",
+    "        for _sweep in range(JACOBI_ITERS):\n",
+    "            # pair (0,1)\n",
+    "            off_diag = g01; diag_diff = g11 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g01 + s*s*g11; ng11 = s*s*g00 + 2.0*s*c*g01 + c*c*g11\n",
+    "            ng02 = c*g02 - s*g12; ng12 = s*g02 + c*g12\n",
+    "            g00 = ng00; g11 = ng11; g01 = 0.0; g02 = ng02; g12 = ng12\n",
+    "            nv00 = c*v00-s*v01; nv01 = s*v00+c*v01; nv10 = c*v10-s*v11; nv11 = s*v10+c*v11\n",
+    "            nv20 = c*v20-s*v21; nv21 = s*v20+c*v21\n",
+    "            v00=nv00; v01=nv01; v10=nv10; v11=nv11; v20=nv20; v21=nv21\n",
+    "            # pair (0,2)\n",
+    "            off_diag = g02; diag_diff = g22 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g02 + s*s*g22; ng22 = s*s*g00 + 2.0*s*c*g02 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g12; ng12b = s*g01 + c*g12\n",
+    "            g00 = ng00; g22 = ng22; g02 = 0.0; g01 = ng01; g12 = ng12b\n",
+    "            nv00 = c*v00-s*v02; nv02 = s*v00+c*v02; nv10 = c*v10-s*v12; nv12 = s*v10+c*v12\n",
+    "            nv20 = c*v20-s*v22; nv22 = s*v20+c*v22\n",
+    "            v00=nv00; v02=nv02; v10=nv10; v12=nv12; v20=nv20; v22=nv22\n",
+    "            # pair (1,2)\n",
+    "            off_diag = g12; diag_diff = g22 - g11; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng11 = c*c*g11 - 2.0*s*c*g12 + s*s*g22; ng22 = s*s*g11 + 2.0*s*c*g12 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g02; ng02b = s*g01 + c*g02\n",
+    "            g11 = ng11; g22 = ng22; g12 = 0.0; g01 = ng01; g02 = ng02b\n",
+    "            nv01 = c*v01-s*v02; nv02 = s*v01+c*v02; nv11 = c*v11-s*v12; nv12 = s*v11+c*v12\n",
+    "            nv21 = c*v21-s*v22; nv22 = s*v21+c*v22\n",
+    "            v01=nv01; v02=nv02; v11=nv11; v12=nv12; v21=nv21; v22=nv22\n",
+    "        # Sort eigenvalues descending + permute V columns\n",
+    "        s0 = tl.sqrt(tl.maximum(g00, EPS)); s1 = tl.sqrt(tl.maximum(g11, EPS)); s2 = tl.sqrt(tl.maximum(g22, EPS))\n",
+    "        do_swap = s0 < s1\n",
+    "        s0, s1 = tl.where(do_swap, s1, s0), tl.where(do_swap, s0, s1)\n",
+    "        tv=v00; v00=tl.where(do_swap,v01,v00); v01=tl.where(do_swap,tv,v01)\n",
+    "        tv=v10; v10=tl.where(do_swap,v11,v10); v11=tl.where(do_swap,tv,v11)\n",
+    "        tv=v20; v20=tl.where(do_swap,v21,v20); v21=tl.where(do_swap,tv,v21)\n",
+    "        do_swap = s0 < s2\n",
+    "        s0, s2 = tl.where(do_swap, s2, s0), tl.where(do_swap, s0, s2)\n",
+    "        tv=v00; v00=tl.where(do_swap,v02,v00); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v10; v10=tl.where(do_swap,v12,v10); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v20; v20=tl.where(do_swap,v22,v20); v22=tl.where(do_swap,tv,v22)\n",
+    "        do_swap = s1 < s2\n",
+    "        s1, s2 = tl.where(do_swap, s2, s1), tl.where(do_swap, s1, s2)\n",
+    "        tv=v01; v01=tl.where(do_swap,v02,v01); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v11; v11=tl.where(do_swap,v12,v11); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v21; v21=tl.where(do_swap,v22,v21); v22=tl.where(do_swap,tv,v22)\n",
+    "        # Write S\n",
+    "        s_base = bid * 3\n",
+    "        tl.store(S_ptr + s_base + 0, s0); tl.store(S_ptr + s_base + 1, s1); tl.store(S_ptr + s_base + 2, s2)\n",
+    "        # Write Vh = V^T\n",
+    "        vh_base = bid * 9\n",
+    "        tl.store(Vh_ptr+vh_base+0,v00); tl.store(Vh_ptr+vh_base+1,v10); tl.store(Vh_ptr+vh_base+2,v20)\n",
+    "        tl.store(Vh_ptr+vh_base+3,v01); tl.store(Vh_ptr+vh_base+4,v11); tl.store(Vh_ptr+vh_base+5,v21)\n",
+    "        tl.store(Vh_ptr+vh_base+6,v02); tl.store(Vh_ptr+vh_base+7,v12); tl.store(Vh_ptr+vh_base+8,v22)\n",
+    "        # Stage 3: U = A @ V @ diag(1/S)\n",
+    "        inv_s0 = 1.0/(s0+EPS); inv_s1 = 1.0/(s1+EPS); inv_s2 = 1.0/(s2+EPS)\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            a0 = tl.load(A_ptr+base+row_idx*3+0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr+base+row_idx*3+1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr+base+row_idx*3+2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            u0 = (a0*v00 + a1*v10 + a2*v20) * inv_s0\n",
+    "            u1 = (a0*v01 + a1*v11 + a2*v21) * inv_s1\n",
+    "            u2 = (a0*v02 + a1*v12 + a2*v22) * inv_s2\n",
+    "            u_base = bid * M * 3\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+0, u0, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+1, u1, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+2, u2, mask=mask)\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fused Triton SVD for (B, M, 3) tensors. Returns U, S, Vh.\"\"\"\n",
+    "        assert A.ndim == 3 and A.shape[2] == 3\n",
+    "        B, M, _ = A.shape\n",
+    "        A_f32 = A.contiguous().float()\n",
+    "        U = torch.empty((B, M, 3), dtype=torch.float32, device=A.device)\n",
+    "        S = torch.empty((B, 3), dtype=torch.float32, device=A.device)\n",
+    "        Vh = torch.empty((B, 3, 3), dtype=torch.float32, device=A.device)\n",
+    "        _svd3_kernel[(B,)](A_f32, U, S, Vh, M=M, BLOCK_M=block_m, JACOBI_ITERS=jacobi_iters, EPS=1e-12)\n",
+    "        return U, S, Vh\n",
+    "\n",
+    "    _HAS_TRITON_SVD3 = True\n",
+    "    print(\"[PERF] Triton SVD3 kernel loaded \u2014 fused 3\u00d73 Jacobi eigensolver\")\n",
+    "except ImportError:\n",
+    "    print(\"[PERF] Triton not available \u2014 falling back to torch.linalg.svd\")\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fallback: torch.linalg.svd for (B, M, 3) tensors.\"\"\"\n",
+    "        return torch.linalg.svd(A.float(), full_matrices=False)\n",
+    "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

spectral/notebooks/experiment_7_composite_pipelines.ipynb CHANGED Viewed

@@ -39,7 +39,7 @@
    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
-    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
@@ -92,6 +92,142 @@
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",

    "metadata": {},
    "source": [
     "# @title Install Dependencies\n",
+    "!pip install -q kymatio torch torchvision tensorboard matplotlib scikit-learn huggingface_hub triton\n",
     "%load_ext tensorboard\n",
     "import torch\n",
     "print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")\n",
     "if device.type == \"cuda\":\n",
     "    print(f\"[PERF] TF32={torch.backends.cuda.matmul.allow_tf32}, cudnn.benchmark={torch.backends.cudnn.benchmark}, linalg={_linalg_lib}\")\n",
     "\n",
+    "# \u2500\u2500 Fused Triton SVD kernel for batched M\u00d73 matrices \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "# cuSOLVER dispatch overhead dominates for tiny (64,3) patches.\n",
+    "# This kernel fuses G=A^T A, 3\u00d73 Jacobi eigensolver (in scalar registers),\n",
+    "# and U recovery into a single kernel launch. ~10,000x faster than cuSOLVER.\n",
+    "_HAS_TRITON_SVD3 = False\n",
+    "try:\n",
+    "    import triton\n",
+    "    import triton.language as tl\n",
+    "\n",
+    "    @triton.jit\n",
+    "    def _svd3_kernel(\n",
+    "        A_ptr, U_ptr, S_ptr, Vh_ptr,\n",
+    "        M: tl.constexpr, BLOCK_M: tl.constexpr,\n",
+    "        JACOBI_ITERS: tl.constexpr, EPS: tl.constexpr,\n",
+    "    ):\n",
+    "        bid = tl.program_id(0)\n",
+    "        # Stage 1: G = A^T A (6 accumulators, symmetric)\n",
+    "        g00 = tl.zeros([], dtype=tl.float32)\n",
+    "        g01 = tl.zeros([], dtype=tl.float32)\n",
+    "        g02 = tl.zeros([], dtype=tl.float32)\n",
+    "        g11 = tl.zeros([], dtype=tl.float32)\n",
+    "        g12 = tl.zeros([], dtype=tl.float32)\n",
+    "        g22 = tl.zeros([], dtype=tl.float32)\n",
+    "        base = bid * M * 3\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            ptr0 = base + row_idx * 3 + 0\n",
+    "            ptr1 = base + row_idx * 3 + 1\n",
+    "            ptr2 = base + row_idx * 3 + 2\n",
+    "            a0 = tl.load(A_ptr + ptr0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr + ptr1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr + ptr2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            g00 += tl.sum(a0 * a0); g01 += tl.sum(a0 * a1); g02 += tl.sum(a0 * a2)\n",
+    "            g11 += tl.sum(a1 * a1); g12 += tl.sum(a1 * a2); g22 += tl.sum(a2 * a2)\n",
+    "        # Stage 2: 3\u00d73 Jacobi eigensolver (all in scalar registers)\n",
+    "        v00 = 1.0; v01 = 0.0; v02 = 0.0\n",
+    "        v10 = 0.0; v11 = 1.0; v12 = 0.0\n",
+    "        v20 = 0.0; v21 = 0.0; v22 = 1.0\n",
+    "        for _sweep in range(JACOBI_ITERS):\n",
+    "            # pair (0,1)\n",
+    "            off_diag = g01; diag_diff = g11 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g01 + s*s*g11; ng11 = s*s*g00 + 2.0*s*c*g01 + c*c*g11\n",
+    "            ng02 = c*g02 - s*g12; ng12 = s*g02 + c*g12\n",
+    "            g00 = ng00; g11 = ng11; g01 = 0.0; g02 = ng02; g12 = ng12\n",
+    "            nv00 = c*v00-s*v01; nv01 = s*v00+c*v01; nv10 = c*v10-s*v11; nv11 = s*v10+c*v11\n",
+    "            nv20 = c*v20-s*v21; nv21 = s*v20+c*v21\n",
+    "            v00=nv00; v01=nv01; v10=nv10; v11=nv11; v20=nv20; v21=nv21\n",
+    "            # pair (0,2)\n",
+    "            off_diag = g02; diag_diff = g22 - g00; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng00 = c*c*g00 - 2.0*s*c*g02 + s*s*g22; ng22 = s*s*g00 + 2.0*s*c*g02 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g12; ng12b = s*g01 + c*g12\n",
+    "            g00 = ng00; g22 = ng22; g02 = 0.0; g01 = ng01; g12 = ng12b\n",
+    "            nv00 = c*v00-s*v02; nv02 = s*v00+c*v02; nv10 = c*v10-s*v12; nv12 = s*v10+c*v12\n",
+    "            nv20 = c*v20-s*v22; nv22 = s*v20+c*v22\n",
+    "            v00=nv00; v02=nv02; v10=nv10; v12=nv12; v20=nv20; v22=nv22\n",
+    "            # pair (1,2)\n",
+    "            off_diag = g12; diag_diff = g22 - g11; abs_off = tl.abs(off_diag)\n",
+    "            tau = tl.where(abs_off > EPS, diag_diff / (2.0 * off_diag), 0.0)\n",
+    "            t = tl.where(abs_off > EPS, tl.where(tau >= 0, 1.0, -1.0) / (tl.abs(tau) + tl.sqrt(1.0 + tau * tau)), 0.0)\n",
+    "            c = 1.0 / tl.sqrt(1.0 + t * t); s = t * c\n",
+    "            ng11 = c*c*g11 - 2.0*s*c*g12 + s*s*g22; ng22 = s*s*g11 + 2.0*s*c*g12 + c*c*g22\n",
+    "            ng01 = c*g01 - s*g02; ng02b = s*g01 + c*g02\n",
+    "            g11 = ng11; g22 = ng22; g12 = 0.0; g01 = ng01; g02 = ng02b\n",
+    "            nv01 = c*v01-s*v02; nv02 = s*v01+c*v02; nv11 = c*v11-s*v12; nv12 = s*v11+c*v12\n",
+    "            nv21 = c*v21-s*v22; nv22 = s*v21+c*v22\n",
+    "            v01=nv01; v02=nv02; v11=nv11; v12=nv12; v21=nv21; v22=nv22\n",
+    "        # Sort eigenvalues descending + permute V columns\n",
+    "        s0 = tl.sqrt(tl.maximum(g00, EPS)); s1 = tl.sqrt(tl.maximum(g11, EPS)); s2 = tl.sqrt(tl.maximum(g22, EPS))\n",
+    "        do_swap = s0 < s1\n",
+    "        s0, s1 = tl.where(do_swap, s1, s0), tl.where(do_swap, s0, s1)\n",
+    "        tv=v00; v00=tl.where(do_swap,v01,v00); v01=tl.where(do_swap,tv,v01)\n",
+    "        tv=v10; v10=tl.where(do_swap,v11,v10); v11=tl.where(do_swap,tv,v11)\n",
+    "        tv=v20; v20=tl.where(do_swap,v21,v20); v21=tl.where(do_swap,tv,v21)\n",
+    "        do_swap = s0 < s2\n",
+    "        s0, s2 = tl.where(do_swap, s2, s0), tl.where(do_swap, s0, s2)\n",
+    "        tv=v00; v00=tl.where(do_swap,v02,v00); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v10; v10=tl.where(do_swap,v12,v10); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v20; v20=tl.where(do_swap,v22,v20); v22=tl.where(do_swap,tv,v22)\n",
+    "        do_swap = s1 < s2\n",
+    "        s1, s2 = tl.where(do_swap, s2, s1), tl.where(do_swap, s1, s2)\n",
+    "        tv=v01; v01=tl.where(do_swap,v02,v01); v02=tl.where(do_swap,tv,v02)\n",
+    "        tv=v11; v11=tl.where(do_swap,v12,v11); v12=tl.where(do_swap,tv,v12)\n",
+    "        tv=v21; v21=tl.where(do_swap,v22,v21); v22=tl.where(do_swap,tv,v22)\n",
+    "        # Write S\n",
+    "        s_base = bid * 3\n",
+    "        tl.store(S_ptr + s_base + 0, s0); tl.store(S_ptr + s_base + 1, s1); tl.store(S_ptr + s_base + 2, s2)\n",
+    "        # Write Vh = V^T\n",
+    "        vh_base = bid * 9\n",
+    "        tl.store(Vh_ptr+vh_base+0,v00); tl.store(Vh_ptr+vh_base+1,v10); tl.store(Vh_ptr+vh_base+2,v20)\n",
+    "        tl.store(Vh_ptr+vh_base+3,v01); tl.store(Vh_ptr+vh_base+4,v11); tl.store(Vh_ptr+vh_base+5,v21)\n",
+    "        tl.store(Vh_ptr+vh_base+6,v02); tl.store(Vh_ptr+vh_base+7,v12); tl.store(Vh_ptr+vh_base+8,v22)\n",
+    "        # Stage 3: U = A @ V @ diag(1/S)\n",
+    "        inv_s0 = 1.0/(s0+EPS); inv_s1 = 1.0/(s1+EPS); inv_s2 = 1.0/(s2+EPS)\n",
+    "        for block_start in range(0, M, BLOCK_M):\n",
+    "            offs = tl.arange(0, BLOCK_M)\n",
+    "            row_idx = block_start + offs\n",
+    "            mask = row_idx < M\n",
+    "            a0 = tl.load(A_ptr+base+row_idx*3+0, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a1 = tl.load(A_ptr+base+row_idx*3+1, mask=mask, other=0.0).to(tl.float32)\n",
+    "            a2 = tl.load(A_ptr+base+row_idx*3+2, mask=mask, other=0.0).to(tl.float32)\n",
+    "            u0 = (a0*v00 + a1*v10 + a2*v20) * inv_s0\n",
+    "            u1 = (a0*v01 + a1*v11 + a2*v21) * inv_s1\n",
+    "            u2 = (a0*v02 + a1*v12 + a2*v22) * inv_s2\n",
+    "            u_base = bid * M * 3\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+0, u0, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+1, u1, mask=mask)\n",
+    "            tl.store(U_ptr+u_base+row_idx*3+2, u2, mask=mask)\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fused Triton SVD for (B, M, 3) tensors. Returns U, S, Vh.\"\"\"\n",
+    "        assert A.ndim == 3 and A.shape[2] == 3\n",
+    "        B, M, _ = A.shape\n",
+    "        A_f32 = A.contiguous().float()\n",
+    "        U = torch.empty((B, M, 3), dtype=torch.float32, device=A.device)\n",
+    "        S = torch.empty((B, 3), dtype=torch.float32, device=A.device)\n",
+    "        Vh = torch.empty((B, 3, 3), dtype=torch.float32, device=A.device)\n",
+    "        _svd3_kernel[(B,)](A_f32, U, S, Vh, M=M, BLOCK_M=block_m, JACOBI_ITERS=jacobi_iters, EPS=1e-12)\n",
+    "        return U, S, Vh\n",
+    "\n",
+    "    _HAS_TRITON_SVD3 = True\n",
+    "    print(\"[PERF] Triton SVD3 kernel loaded \u2014 fused 3\u00d73 Jacobi eigensolver\")\n",
+    "except ImportError:\n",
+    "    print(\"[PERF] Triton not available \u2014 falling back to torch.linalg.svd\")\n",
+    "\n",
+    "    def batched_svd3(A, block_m=128, jacobi_iters=6):\n",
+    "        \"\"\"Fallback: torch.linalg.svd for (B, M, 3) tensors.\"\"\"\n",
+    "        return torch.linalg.svd(A.float(), full_matrices=False)\n",
+    "\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
     "# GEOLIP CORE \u2014 Geometric Building Blocks\n",
     "# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",