AbstractPhil
/

geolip-hypersphere-experiments

TensorBoard

Model card Files Files and versions

xet

Metrics Training metrics Community

AbstractPhil committed on 4 days ago

Commit

cc4f091

verified ·

1 Parent(s): fd243cb

Create constellation.py

Browse files

Files changed (1) hide show

constellation.py +476 -0

constellation.py ADDED Viewed

	@@ -0,0 +1,476 @@

+"""
+Constellation — Unified Geometric Observer + Interpreter
+==========================================================
+Configurable implementation covering all validated constellation forms.
+PROVEN RESULTS:
+  Form 1 (Core):   91.5% CIFAR-10 @ 1.6M params, CV=0.2045
+  Form 5 (Relay):  cos_to_orig=0.994 @ depth 16, 8.4× faster than attn @ 131K
+  Hybrid:          88.0% CIFAR-10 @ 23.5M (conv encoder + constellation)
+  Scattering v1:   81.9% CIFAR-10 @ 17M (frozen scattering + constellation)
+UNIVERSAL RULES (empirically validated):
+  - SquaredReLU in all constellation paths, never GELU
+  - Patchwork: Linear(in, in*2) → SquaredReLU → LN → Linear(in*2, out)
+  - Gate init: -3.0 (sigmoid ≈ 0.047) for relay/residual forms
+  - SLERP: acos in fp32, everything else in compute dtype
+  - Adam, NO weight decay — geometry IS regularization
+  - InfoNCE is alignment FORCE, Procrustes is REGULARIZER
+  - CV loss on the BOTTLENECK, weight 0.001 or below
+  - Anchor dropout (30%) prevents collapse in high-anchor configs
+FORMS:
+  Constellation       — observation + interpretation, configurable
+  ConstellationRelay  — per-token geometric layer with gated residual
+Usage:
+    from constellation import Constellation, ConstellationRelay
+    # Form 1 (Core): single vector per image
+    c = Constellation(n_anchors=16, dim=16, n_directions=8,
+                      d_comp=64, n_phases=3)
+    output = c(directions)  # (B, 8, 16) → ConstellationOutput
+    # Form 5 (Relay): per-token processing
+    r = ConstellationRelay(dim=256, patch_dim=16, n_anchors=16)
+    out = r(tokens)  # (B, S, 256) → (B, S, 256)
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from dataclasses import dataclass
+from typing import Optional
+# ══════════════════════════════════════════════════════════════════
+# ACTIVATION
+# ══════════════════════════════════════════════════════════════════
+class SquaredReLU(nn.Module):
+    """x → ReLU(x)². Proven superior to GELU in all constellation paths."""
+    def forward(self, x):
+        return F.relu(x) ** 2
+# ══════════════════════════════════════════════════════════════════
+# ANCHOR INITIALIZATION
+# ══════════════════════════════════════════════════════════════════
+def init_anchors_xavier(n, d):
+    """Xavier normal → normalize. Near-orthogonal in high-d. Used in Core."""
+    w = torch.empty(n, d)
+    nn.init.xavier_normal_(w)
+    return F.normalize(w, dim=-1)
+def init_anchors_orthogonal(n, d):
+    """QR decomposition → exact orthonormal basis. Used when n <= d."""
+    if n <= d:
+        M = torch.randn(d, n)
+        Q, _ = torch.linalg.qr(M)
+        return Q.T.contiguous()
+    else:
+        M = torch.randn(d, d)
+        Q, _ = torch.linalg.qr(M)
+        basis = Q.T
+        extra = F.normalize(torch.randn(n - d, d), dim=-1)
+        return torch.cat([basis, extra], dim=0)
+def init_anchors_repulsion(n, d, iters=200, lr=0.05):
+    """QR + iterative repulsion for even coverage beyond d anchors."""
+    vecs = init_anchors_orthogonal(n, d)
+    vecs = F.normalize(vecs, dim=-1)
+    for _ in range(iters):
+        sim = vecs @ vecs.T
+        sim.fill_diagonal_(-2.0)
+        nn_idx = sim.argmax(dim=1)
+        vecs = F.normalize(vecs - lr * vecs[nn_idx], dim=-1)
+    return vecs
+# Registry of anchor initializers, keyed by the name accepted as the
+# `anchor_init` argument. Each entry is called as fn(n_anchors, dim).
+INIT_METHODS = {
+    'xavier': init_anchors_xavier,
+    'orthogonal': init_anchors_orthogonal,
+    'repulsion': init_anchors_repulsion,
+}
+# ══════════════════════════════════════════════════════════════════
+# OUTPUT
+# ══════════════════════════════════════════════════════════════════
+@dataclass
+class ConstellationOutput:
+    """Full output from constellation forward pass.
+    Shape legend: B = batch, N = directions per sample, D = direction dim,
+    A = number of anchors, phases = number of SLERP phases.
+    """
+    embedding: torch.Tensor       # (B, embedding_dim) — interpreted observation
+    cosines: torch.Tensor         # (B, N, A) or (B, N, A*phases) — direction·anchor dot products, phases concatenated on the last axis
+    distances: torch.Tensor       # (B, N, A) or (B, N, A*phases) — 1 - cosines
+    nearest: torch.Tensor         # (B, N) — index of the highest-cosine anchor at phase 0
+    directions: torch.Tensor      # (B, N, D) — input directions (expected on S^(D-1))
+    tri_flat: torch.Tensor        # (B, tri_dim) — distances flattened per sample
+# ══════════════════════════════════════════════════════════════════
+# CONSTELLATION — observation + interpretation
+# ══════════════════════════════════════════════════════════════════
+class Constellation(nn.Module):
+    """Geometric observer with anchor-aligned interpretation.
+    Anchors on S^(D-1) observe input directions via triangulation.
+    Compartments interpret per-anchor observations.
+    SLERP phases provide multi-scale angular measurement.
+    All coupled through gradient flow.
+    Args:
+        n_anchors: reference directions on S^(D-1)
+        dim: anchor/direction dimensionality
+        n_directions: input directions per sample
+        d_comp: hidden dim per compartment
+        n_phases: SLERP interpolation phases (1=static, 3=proven default)
+        anchor_init: 'xavier', 'orthogonal', or 'repulsion'
+        anchor_dropout: fraction of anchors to drop during training (0.3 for soup)
+        compartment: 'aligned' (one per anchor) or 'flat' (single patchwork)
+    """
+    def __init__(
+        self,
+        n_anchors: int,
+        dim: int,
+        n_directions: int,
+        d_comp: int = 64,
+        n_phases: int = 3,
+        anchor_init: str = 'xavier',
+        anchor_dropout: float = 0.0,
+        compartment: str = 'aligned',
+    ):
+        super().__init__()
+        self.n_anchors = n_anchors
+        self.dim = dim
+        self.n_directions = n_directions
+        self.d_comp = d_comp
+        self.n_phases = n_phases
+        self.anchor_dropout = anchor_dropout
+        self.compartment_type = compartment
+        # Anchors: home (frozen) + current (learned)
+        init_fn = INIT_METHODS[anchor_init]
+        home = init_fn(n_anchors, dim)
+        self.register_buffer('home', home)
+        self.anchors = nn.Parameter(home.clone())
+        # Triangulation dimensions
+        if compartment == 'aligned':
+            # tri: (B, N, A * phases) → each compartment reads its anchor's column
+            self.tri_dim = n_directions * n_anchors * n_phases
+            self.embedding_dim = n_anchors * d_comp
+            # One compartment per anchor — reads tri[:, :, k] across all phases
+            # Input: n_directions * n_phases values per anchor
+            comp_in = n_directions * n_phases
+            self.compartments = nn.ModuleList([
+                nn.Sequential(
+                    nn.Linear(comp_in, d_comp * 2),
+                    SquaredReLU(),
+                    nn.Linear(d_comp * 2, d_comp),
+                    nn.LayerNorm(d_comp),
+                ) for _ in range(n_anchors)
+            ])
+        elif compartment == 'flat':
+            # tri: (B, tri_dim) → single patchwork MLP
+            self.tri_dim = n_directions * n_anchors * n_phases
+            self.embedding_dim = dim
+            self.patchwork = nn.Sequential(
+                nn.Linear(self.tri_dim, self.tri_dim * 2),
+                SquaredReLU(),
+                nn.LayerNorm(self.tri_dim * 2),
+                nn.Linear(self.tri_dim * 2, dim),
+            )
+        else:
+            raise ValueError(f"Unknown compartment type: {compartment}")
+        self._init_weights()
+    def _init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+    def drift(self):
+        """Geodesic distance between home and learned anchor positions."""
+        h = F.normalize(self.home.float(), dim=-1)
+        c = F.normalize(self.anchors.float(), dim=-1)
+        return torch.acos((h * c).sum(-1).clamp(-1 + 1e-6, 1 - 1e-6))
+    def at_phase(self, t):
+        """SLERP between home and learned positions at phase t ∈ [0, 1]."""
+        h = F.normalize(self.home.float(), dim=-1)
+        c = F.normalize(self.anchors.float(), dim=-1)
+        omega = self.drift().unsqueeze(-1)  # (A, 1)
+        so = omega.sin().clamp(min=1e-6)
+        return torch.sin((1 - t) * omega) / so * h + torch.sin(t * omega) / so * c
+    def _triangulate(self, directions, anchors):
+        """(B, N, D) × (A, D) → (B, N, A) cosines and distances."""
+        cos = torch.einsum('bnd,ad->bna', directions, anchors)
+        return cos, 1.0 - cos
+    def forward(self, directions: torch.Tensor) -> ConstellationOutput:
+        """Observe and interpret.
+        Args:
+            directions: (B, N, D) — L2-normalized to S^(D-1)
+        Returns:
+            ConstellationOutput
+        """
+        B, N, D = directions.shape
+        # Multi-phase triangulation
+        phases = torch.linspace(0, 1, self.n_phases, device=directions.device).tolist()
+        all_cos = []
+        all_dist = []
+        for t in phases:
+            anchors_t = F.normalize(self.at_phase(t), dim=-1).to(directions.dtype)
+            # Anchor dropout during training
+            if self.training and self.anchor_dropout > 0:
+                mask = torch.rand(anchors_t.shape[0], device=anchors_t.device) > self.anchor_dropout
+                if mask.sum() < 2:
+                    mask[:2] = True
+                anchors_t = anchors_t[mask]
+            cos, dist = self._triangulate(directions, anchors_t)
+            all_cos.append(cos)
+            all_dist.append(dist)
+        # Stack phases: (B, N, A*phases) if no dropout, variable if dropout
+        cos_cat = torch.cat(all_cos, dim=-1)
+        dist_cat = torch.cat(all_dist, dim=-1)
+        # Nearest anchor (from phase 0, no dropout)
+        anchors_0 = F.normalize(self.at_phase(0.0), dim=-1).to(directions.dtype)
+        cos_0 = torch.einsum('bnd,ad->bna', directions, anchors_0)
+        nearest = cos_0.max(dim=-1).indices
+        # Interpret
+        if self.compartment_type == 'aligned' and not (self.training and self.anchor_dropout > 0):
+            # dist_cat: (B, N, A * n_phases)
+            # Reshape to (B, N, n_phases, A) then (B, A, N * n_phases)
+            A = self.n_anchors
+            dist_reshape = dist_cat.reshape(B, N, self.n_phases, A)
+            # For compartment k: gather distances to anchor k across all directions and phases
+            # dist_reshape[:, :, :, k] → (B, N, n_phases) → flatten → (B, N*n_phases)
+            parts = []
+            for k in range(A):
+                comp_input = dist_reshape[:, :, :, k].reshape(B, N * self.n_phases)
+                parts.append(self.compartments[k](comp_input))
+            embedding = torch.cat(parts, dim=-1)  # (B, A * d_comp)
+        elif self.compartment_type == 'flat' or (self.training and self.anchor_dropout > 0):
+            tri_flat = dist_cat.reshape(B, -1)
+            if self.compartment_type == 'flat':
+                embedding = self.patchwork(tri_flat)
+            else:
+                # Fallback for aligned + dropout: pad and use compartments
+                # This is a training-only path
+                embedding = torch.zeros(B, self.embedding_dim,
+                                        device=directions.device, dtype=directions.dtype)
+                # Use flat mean as fallback during dropout
+                for k in range(self.n_anchors):
+                    comp_in_size = self.n_directions * self.n_phases
+                    if tri_flat.shape[1] >= comp_in_size:
+                        chunk = tri_flat[:, :comp_in_size]
+                    else:
+                        chunk = F.pad(tri_flat, (0, comp_in_size - tri_flat.shape[1]))
+                    embedding[:, k * self.d_comp:(k + 1) * self.d_comp] = self.compartments[k](chunk)
+        else:
+            tri_flat = dist_cat.reshape(B, -1)
+            embedding = self.patchwork(tri_flat)
+        tri_flat = dist_cat.reshape(B, -1)
+        return ConstellationOutput(
+            embedding=embedding,
+            cosines=cos_cat,
+            distances=dist_cat,
+            nearest=nearest,
+            directions=directions,
+            tri_flat=tri_flat,
+        )
+# ══════════════════════════════════════════════════════════════════
+# CONSTELLATION RELAY — Form 5 (per-token geometric layer)
+# ══════════════════════════════════════════════════════════════════
+class ConstellationRelay(nn.Module):
+    """Per-token geometric processing layer with gated residual.
+    Replaces attention as a per-token processing layer.
+    O(S) complexity. No cross-token interaction.
+    Preserves 99.4% cosine similarity to input at depth 16.
+    Pipeline:
+      LayerNorm → chunk D into patches → L2 norm per patch
+      → Constellation observation + interpretation
+      → Project back to D → gated residual
+    Args:
+        dim: token dimension (must be divisible by patch_dim)
+        patch_dim: dimension per patch subspace (default 16)
+        n_anchors: anchors per patch subspace
+        d_comp: hidden dim per compartment
+        n_phases: SLERP phases
+        gate_init: initial gate bias (default -3.0 → sigmoid ≈ 0.047)
+        anchor_init: initialization method
+    """
+    def __init__(
+        self,
+        dim: int,
+        patch_dim: int = 16,
+        n_anchors: int = 16,
+        d_comp: int = 64,
+        n_phases: int = 3,
+        gate_init: float = -3.0,
+        anchor_init: str = 'xavier',
+    ):
+        super().__init__()
+        assert dim % patch_dim == 0
+        self.dim = dim
+        self.patch_dim = patch_dim
+        self.n_patches = dim // patch_dim
+        self.norm = nn.LayerNorm(dim)
+        # Constellation operates on (B*S, n_patches, patch_dim)
+        self.constellation = Constellation(
+            n_anchors=n_anchors,
+            dim=patch_dim,
+            n_directions=self.n_patches,
+            d_comp=d_comp,
+            n_phases=n_phases,
+            anchor_init=anchor_init,
+            compartment='aligned',
+        )
+        # Project constellation embedding back to token dim
+        self.proj = nn.Linear(self.constellation.embedding_dim, dim)
+        # Gated residual — init at -3.0 so gate starts near 0
+        self.gate = nn.Parameter(torch.full((dim,), gate_init))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        x: (B, S, D) or (B, D)
+        Returns: same shape as input
+        """
+        squeeze = False
+        if x.dim() == 2:
+            x = x.unsqueeze(1)
+            squeeze = True
+        B, S, D = x.shape
+        residual = x
+        # Normalize
+        h = self.norm(x)
+        # Chunk into patches and normalize to S^(patch_dim-1)
+        h_flat = h.reshape(B * S, self.n_patches, self.patch_dim)
+        h_flat = F.normalize(h_flat, dim=-1)
+        # Constellation: observe + interpret
+        output = self.constellation(h_flat)
+        # Project back to token dim
+        update = self.proj(output.embedding)  # (B*S, D)
+        update = update.reshape(B, S, D)
+        # Gated residual
+        g = torch.sigmoid(self.gate)
+        out = residual + g * update
+        if squeeze:
+            out = out.squeeze(1)
+        return out
+# ══════════════════════════════════════════════════════════════════
+# GEOMETRIC OPS — measurement tools
+# ══════════════════════════════════════════════════════════════════
+class GeometricOps:
+    """Static geometric utilities for constellation monitoring and loss."""
+    @staticmethod
+    def cayley_menger_vol2(points):
+        """Squared simplex volume. points: (B, N, D) → (B,)."""
+        B, N, D = points.shape
+        gram = torch.bmm(points, points.transpose(1, 2))
+        norms = torch.diagonal(gram, dim1=1, dim2=2)
+        d2 = norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram
+        d2 = F.relu(d2)
+        cm = torch.zeros(B, N + 1, N + 1, device=points.device, dtype=points.dtype)
+        cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
+        k = N - 1
+        sign = (-1.0) ** (k + 1)
+        fact = math.factorial(k)
+        return sign * torch.linalg.det(cm.float()).to(points.dtype) / ((2 ** k) * (fact ** 2))
+    @staticmethod
+    def cv_metric(emb, n_samples=200, n_points=5):
+        """Non-differentiable CV for monitoring. Target band: 0.20–0.23."""
+        vols = []
+        for _ in range(n_samples):
+            idx = torch.randperm(emb.shape[0])[:n_points]
+            v2 = GeometricOps.cayley_menger_vol2(emb[idx].unsqueeze(0))
+            if v2[0] > 1e-20:
+                vols.append(v2[0].sqrt())
+        if len(vols) < 10:
+            return 0.0
+        vols_t = torch.stack(vols)
+        return (vols_t.std() / (vols_t.mean() + 1e-8)).item()
+    @staticmethod
+    def cv_loss(emb, target=0.22, n_samples=100, n_points=5):
+        """Differentiable CV loss. Weight: 0.001 or below."""
+        vols = []
+        for _ in range(n_samples):
+            idx = torch.randperm(min(emb.shape[0], 512))[:n_points]
+            v2 = GeometricOps.cayley_menger_vol2(emb[idx].unsqueeze(0))
+            if v2[0] > 1e-20:
+                vols.append(v2[0].sqrt())
+        if len(vols) < 5:
+            return torch.tensor(0.0, device=emb.device)
+        vols_t = torch.stack(vols)
+        cv = vols_t.std() / (vols_t.mean() + 1e-8)
+        return (cv - target).pow(2)
+    @staticmethod
+    def anchor_spread_loss(anchors, target_cos=0.0):
+        """Repulsion loss keeping anchors spread on the sphere."""
+        a = F.normalize(anchors, dim=-1)
+        sim = a @ a.T
+        mask = ~torch.eye(a.shape[0], dtype=torch.bool, device=a.device)
+        return F.relu(sim[mask] - target_cos).mean()
+    @staticmethod
+    def diagnostics(output: ConstellationOutput, n_anchors: int) -> dict:
+        """Compute diagnostic metrics."""
+        diag = {}
+        diag['n_active'] = output.nearest.flatten().unique().numel()
+        counts = torch.bincount(output.nearest.flatten(), minlength=n_anchors).float()
+        diag['anchor_util_std'] = counts.std().item()
+        diag['nearest_cos'] = output.cosines[:, :, :n_anchors].max(dim=-1).values.mean().item()
+        diag['mean_tri'] = output.distances.mean().item()
+        return diag