AbstractPhil
/

penta-vit-experiments

Zero-Shot Classification

TensorBoard

Model card Files Files and versions

xet

Metrics Training metrics Community

AbstractPhil commited on Sep 11, 2025

Commit

7465a5c

verified ·

1 Parent(s): fa4e774

Added theta experimental head

Browse files

Files changed (1) hide show

vit_zana_v3.py +214 -101

vit_zana_v3.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """
 Baseline Vision Transformer with Frozen Pentachora Embeddings
-Clean architecture with geometric semantic anchors
-Assumes PentachoronStabilizer is loaded externally
 """
 import torch
@@ -16,17 +15,16 @@ from typing import Optional, Tuple, Dict, Any
 class PentachoraEmbedding(nn.Module):
     """
     A single frozen pentachora embedding (5 vertices in geometric space).
-    Accepts pre-computed vertices only. No random initialization.
     """
     def __init__(self, vertices: torch.Tensor):
         super().__init__()
-        #assert vertices.shape == (5, 128), f"Expected shape (5, 128), got {vertices.shape}"
         self.embed_dim = vertices.shape[-1]
         # Store provided vertices as frozen buffer
-        self.register_buffer('vertices', vertices)
         self.vertices.requires_grad = False
         # Precompute normalized versions and centroid
@@ -34,26 +32,27 @@ class PentachoraEmbedding(nn.Module):
             self.register_buffer('vertices_norm', F.normalize(self.vertices, dim=-1))
             self.register_buffer('centroid', self.vertices.mean(dim=0))
             self.register_buffer('centroid_norm', F.normalize(self.centroid, dim=-1))
     def get_vertices(self) -> torch.Tensor:
-        """Get all 5 vertices."""
         return self.vertices
     def get_centroid(self) -> torch.Tensor:
-        """Get the centroid of the pentachora."""
         return self.centroid
     def compute_rose_score(self, features: torch.Tensor) -> torch.Tensor:
-        """
-        Compute Rose similarity score with this pentachora.
-        Uses external PentachoronStabilizer.rose_score_magnitude
-        """
-        # Prepare vertices for rose scoring
-        verts = self.vertices.unsqueeze(0)  # [1, 5, D]
         if features.dim() == 1:
             features = features.unsqueeze(0)
-        # Expand vertices to batch size if needed
         B = features.shape[0]
         if B > 1:
             verts = verts.expand(B, -1, -1)
@@ -61,28 +60,110 @@ class PentachoraEmbedding(nn.Module):
         return PentachoronStabilizer.rose_score_magnitude(features, verts)
     def compute_similarity(self, features: torch.Tensor, mode: str = 'centroid') -> torch.Tensor:
-        """
-        Compute similarity between features and this pentachora.
-        Args:
-            features: [batch, dim] or [batch, seq, dim]
-            mode: 'centroid', 'max' (max over vertices), or 'rose' (Rose score)
-        Returns:
-            similarities: [batch] or [batch, seq]
-        """
         if mode == 'rose':
             return self.compute_rose_score(features)
         features_norm = F.normalize(features, dim=-1)
         if mode == 'centroid':
-            # Dot product with centroid
             return torch.matmul(features_norm, self.centroid_norm)
         else:  # mode == 'max'
-            # Max similarity across vertices
             sims = torch.matmul(features_norm, self.vertices_norm.T)
             return sims.max(dim=-1)[0]
 class TransformerBlock(nn.Module):
@@ -117,25 +198,22 @@ class TransformerBlock(nn.Module):
         )
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # Self-attention
         x_norm = self.norm1(x)
         attn_out, _ = self.attn(x_norm, x_norm, x_norm)
         x = x + attn_out
-        # MLP
         x = x + self.mlp(self.norm2(x))
         return x
 class BaselineViT(nn.Module):
     """
-    Clean baseline Vision Transformer with frozen pentachora embeddings.
     """
     def __init__(
         self,
-        pentachora_list: list,  # List of torch.Tensor, each [5, vocab_dim]
         vocab_dim: int = 256,
         img_size: int = 32,
         patch_size: int = 4,
@@ -145,25 +223,23 @@ class BaselineViT(nn.Module):
         mlp_ratio: float = 4.0,
         dropout: float = 0.0,
         attn_dropout: float = 0.0,
-        similarity_mode: str = 'rose'  # 'centroid', 'max', or 'rose'
     ):
         super().__init__()
-        # Validate pentachora list
-        assert isinstance(pentachora_list, list), f"Expected list, got {type(pentachora_list)}"
-        assert len(pentachora_list) > 0, "Empty pentachora list"
-        # Validate each pentachora
-        for i, penta in enumerate(pentachora_list):
-            assert isinstance(penta, torch.Tensor), f"Item {i} is not a tensor"
         self.num_classes = len(pentachora_list)
         self.embed_dim = embed_dim
         self.num_patches = (img_size // patch_size) ** 2
         self.similarity_mode = similarity_mode
         self.pentachora_dim = vocab_dim
-        # Create individual pentachora embeddings from list
         self.class_pentachora = nn.ModuleList([
             PentachoraEmbedding(vertices=penta)
             for penta in pentachora_list
@@ -172,7 +248,7 @@ class BaselineViT(nn.Module):
         # Patch embedding
         self.patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
-        # CLS token - learnable
         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
         # Position embeddings
@@ -200,23 +276,33 @@ class BaselineViT(nn.Module):
         else:
             self.to_pentachora_dim = nn.Identity()
-        # Temperature for similarity-based classification
-        self.temperature = nn.Parameter(torch.ones(1) * np.log(1/0.07))
-        self.register_buffer(
-            'all_centroids',
-            torch.stack([penta.centroid for penta in self.class_pentachora])
-        )
-        self.register_buffer(
-            'all_centroids_norm',
-            F.normalize(self.all_centroids, dim=-1)
-        )
-        # Initialize weights
         self.init_weights()
     def init_weights(self):
-        """Initialize model weights."""
         nn.init.trunc_normal_(self.cls_token, std=0.02)
         nn.init.trunc_normal_(self.pos_embed, std=0.02)
@@ -229,35 +315,33 @@ class BaselineViT(nn.Module):
                 nn.init.ones_(m.weight)
                 nn.init.zeros_(m.bias)
-    # Then get_class_centroids becomes:
     def get_class_centroids(self) -> torch.Tensor:
-        return self.all_centroids_norm
     def compute_pentachora_similarities(self, features: torch.Tensor) -> torch.Tensor:
-        """
-        Compute similarities between features and all class pentachora (vectorized).
-        """
         if self.similarity_mode == 'rose':
-            # Stack all vertices into single tensor for batch Rose scoring
-            all_vertices = torch.stack([penta.vertices for penta in self.class_pentachora])  # [100, 5, vocab_dim]
-            # Expand features for batch computation
-            features_exp = features.unsqueeze(1).expand(-1, self.num_classes, -1)  # [B, 100, vocab_dim]
-            # Compute Rose scores in parallel
-            return PentachoronStabilizer.rose_score_magnitude(features_exp.reshape(-1, self.embed_dim), all_vertices.repeat(features.shape[0], 1, 1)).reshape(features.shape[0], -1)
         else:
-            # Stack all centroids
-            centroids = torch.stack([penta.centroid_norm for penta in self.class_pentachora])  # [100, vocab_dim]
-            features_norm = F.normalize(features, dim=-1)  # [B, vocab_dim]
-            return torch.matmul(features_norm, centroids.T)  # [B, 100]
     def forward_features(self, x: torch.Tensor) -> torch.Tensor:
-        """Extract features from images."""
         B = x.shape[0]
         # Patch embedding
-        x = self.patch_embed(x)  # [B, embed_dim, H', W']
-        x = x.flatten(2).transpose(1, 2)  # [B, num_patches, embed_dim]
         # Add CLS token
         cls_tokens = self.cls_token.expand(B, -1, -1)
@@ -279,35 +363,31 @@ class BaselineViT(nn.Module):
     def forward(self, x: torch.Tensor, return_features: bool = False) -> Dict[str, torch.Tensor]:
         """
-        Forward pass.
-        Returns dict with:
-            - logits: classification logits
-            - features: CLS features (if return_features=True)
-            - similarities: raw similarities to pentachora
         """
         features = self.forward_features(x)
         output = {}
         # Project to pentachora dimension
         features_proj = self.to_pentachora_dim(features)
-        # Compute similarities based on mode
-        if self.similarity_mode == 'rose':
-            # Use Rose scoring
-            similarities = self.compute_pentachora_similarities(features_proj)
         else:
-            # Use centroid or max similarity
-            features_norm = F.normalize(features_proj, dim=-1)
-            centroids = self.get_class_centroids()
-            similarities = torch.matmul(features_norm, centroids.T)
-        # Scale by temperature
-        logits = similarities * self.temperature.exp()
-        output['logits'] = logits
-        output['similarities'] = similarities
         if return_features:
             output['features'] = features
@@ -315,9 +395,42 @@ class BaselineViT(nn.Module):
         return output
-# Test - requires external setup
 if __name__ == "__main__":
-    print("BaselineViT requires:")
-    print("  1. PentachoronStabilizer loaded externally")
-    print("  2. pentachora_batch tensor [num_classes, 5, vocab_dim]")
-    print("\nNo random initialization. No fallbacks.")

 """
 Baseline Vision Transformer with Frozen Pentachora Embeddings
+Now with optional theta rotation head for better classification
 """
 import torch
 class PentachoraEmbedding(nn.Module):
     """
     A single frozen pentachora embedding (5 vertices in geometric space).
+    Now with theta rotation capabilities.
     """
     def __init__(self, vertices: torch.Tensor):
         super().__init__()
         self.embed_dim = vertices.shape[-1]
         # Store provided vertices as frozen buffer
+        self.register_buffer('vertices', vertices.cpu().contiguous().detach().clone().to(get_default_device()))
         self.vertices.requires_grad = False
         # Precompute normalized versions and centroid
             self.register_buffer('vertices_norm', F.normalize(self.vertices, dim=-1))
             self.register_buffer('centroid', self.vertices.mean(dim=0))
             self.register_buffer('centroid_norm', F.normalize(self.centroid, dim=-1))
+            # Compute theta bases for rotation
+            self.register_buffer('theta_bases', self._compute_theta_bases().cpu().contiguous().detach().clone().to(get_default_device()))
+    def _compute_theta_bases(self) -> torch.Tensor:
+        """Compute orthogonal bases from vertices for theta rotation."""
+        U, S, V = torch.svd(self.vertices)
+        n_components = min(5, self.embed_dim)
+        return V[:, :n_components]  # [embed_dim, n_components]
     def get_vertices(self) -> torch.Tensor:
         return self.vertices
     def get_centroid(self) -> torch.Tensor:
         return self.centroid
     def compute_rose_score(self, features: torch.Tensor) -> torch.Tensor:
+        verts = self.vertices.unsqueeze(0)
         if features.dim() == 1:
             features = features.unsqueeze(0)
         B = features.shape[0]
         if B > 1:
             verts = verts.expand(B, -1, -1)
         return PentachoronStabilizer.rose_score_magnitude(features, verts)
     def compute_similarity(self, features: torch.Tensor, mode: str = 'centroid') -> torch.Tensor:
         if mode == 'rose':
             return self.compute_rose_score(features)
         features_norm = F.normalize(features, dim=-1)
         if mode == 'centroid':
             return torch.matmul(features_norm, self.centroid_norm)
         else:  # mode == 'max'
             sims = torch.matmul(features_norm, self.vertices_norm.T)
             return sims.max(dim=-1)[0]
+    def compute_theta_features(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Project features to theta space defined by this pentachora.
+        Returns angular features for feedforward classification.
+        """
+        # Project onto pentachora bases
+        projections = torch.matmul(features, self.theta_bases)  # [batch, 5]
+        # Compute angles relative to centroid
+        centroid_proj = torch.matmul(self.centroid.unsqueeze(0), self.theta_bases)
+        angles = torch.atan2(projections, centroid_proj + 1e-8)
+        # Return sin/cos encoding
+        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1).to(get_default_device())  # [batch, 10]
+class ThetaHead(nn.Module):
+    """
+    Theta-based classification head using angular representations.
+    Replaces similarity matching with learned feedforward.
+    """
+    def __init__(
+        self,
+        embed_dim: int,
+        num_classes: int,
+        n_pentachora: int = 10,  # Use subset of pentachora for theta
+        hidden_dim: int = 256,
+        dropout: float = 0.1
+    ):
+        super().__init__()
+        self.n_pentachora = n_pentachora
+        self.embed_dim = embed_dim
+        # Each pentachora gives 10 theta features (5 sin + 5 cos)
+        theta_dim = n_pentachora * 10
+        # Project to theta space
+        self.to_theta = nn.Sequential(
+            nn.Linear(embed_dim, hidden_dim),
+            nn.LayerNorm(hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, theta_dim)
+        )
+        # Classify from theta
+        self.classifier = nn.Sequential(
+            nn.LayerNorm(theta_dim),
+            nn.Dropout(dropout),
+            nn.Linear(theta_dim, num_classes)
+        )
+        # Learnable temperature
+        self.temperature = nn.Parameter(torch.ones(1) * 0.1)
+    def forward(self, features: torch.Tensor, pentachora_list: nn.ModuleList) -> Dict[str, torch.Tensor]:
+        """
+        Classify using theta rotation.
+        Args:
+            features: [batch, embed_dim] CLS features
+            pentachora_list: List of PentachoraEmbedding modules
+        """
+        # Get theta features from first n pentachora
+        theta_features = []
+        for i in range(min(self.n_pentachora, len(pentachora_list))):
+            theta = pentachora_list[i].compute_theta_features(features)
+            theta_features.append(theta)
+        # Concatenate all theta features
+        theta_concat = torch.cat(theta_features, dim=-1)  # [batch, n_pentachora * 10]
+        # If we have fewer pentachora than expected, pad with zeros
+        if len(theta_features) < self.n_pentachora:
+            pad_size = (self.n_pentachora - len(theta_features)) * 10
+            padding = torch.zeros(features.shape[0], pad_size, device=features.device)
+            theta_concat = torch.cat([theta_concat, padding], dim=-1)
+        # Project through MLP
+        theta_proj = self.to_theta(features)
+        # Combine with geometric theta (residual connection)
+        theta_combined = theta_concat + 0.1 * theta_proj
+        # Classify
+        logits = self.classifier(theta_combined) / self.temperature.exp()
+        return {
+            'logits': logits,
+            'theta_features': theta_combined
+        }
 class TransformerBlock(nn.Module):
         )
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_norm = self.norm1(x)
         attn_out, _ = self.attn(x_norm, x_norm, x_norm)
         x = x + attn_out
         x = x + self.mlp(self.norm2(x))
         return x
 class BaselineViT(nn.Module):
     """
+    Vision Transformer with optional theta-based classification.
+    Can switch between similarity-based and theta-based heads.
     """
     def __init__(
         self,
+        pentachora_list: list,
         vocab_dim: int = 256,
         img_size: int = 32,
         patch_size: int = 4,
         mlp_ratio: float = 4.0,
         dropout: float = 0.0,
         attn_dropout: float = 0.0,
+        similarity_mode: str = 'rose',
+        use_theta_head: bool = True,  # NEW: Toggle theta head
+        theta_n_pentachora: int = 2,  # NEW: How many pentachora for theta
+        theta_hidden_dim: int = 256    # NEW: Hidden dim for theta MLP
     ):
         super().__init__()
+        assert isinstance(pentachora_list, list) and len(pentachora_list) > 0
         self.num_classes = len(pentachora_list)
         self.embed_dim = embed_dim
         self.num_patches = (img_size // patch_size) ** 2
         self.similarity_mode = similarity_mode
         self.pentachora_dim = vocab_dim
+        self.use_theta_head = use_theta_head
+        # Create pentachora embeddings
         self.class_pentachora = nn.ModuleList([
             PentachoraEmbedding(vertices=penta)
             for penta in pentachora_list
         # Patch embedding
         self.patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
+        # CLS token
         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
         # Position embeddings
         else:
             self.to_pentachora_dim = nn.Identity()
+        # Classification heads
+        if use_theta_head:
+            # NEW: Theta-based classification
+            self.theta_head = ThetaHead(
+                embed_dim=self.pentachora_dim,
+                num_classes=self.num_classes,
+                n_pentachora=theta_n_pentachora,
+                hidden_dim=theta_hidden_dim,
+                dropout=dropout
+            )
+        else:
+            # Original: Similarity-based classification
+            self.theta_head = None
+            self.temperature = nn.Parameter(torch.ones(1) * np.log(1/0.07))
+            self.register_buffer(
+                'all_centroids',
+                torch.stack([penta.centroid for penta in self.class_pentachora])
+            )
+            self.register_buffer(
+                'all_centroids_norm',
+                F.normalize(self.all_centroids, dim=-1)
+            )
         self.init_weights()
     def init_weights(self):
         nn.init.trunc_normal_(self.cls_token, std=0.02)
         nn.init.trunc_normal_(self.pos_embed, std=0.02)
                 nn.init.ones_(m.weight)
                 nn.init.zeros_(m.bias)
     def get_class_centroids(self) -> torch.Tensor:
+        if self.use_theta_head:
+            # Return centroids from pentachora for compatibility
+            centroids = torch.stack([penta.centroid_norm for penta in self.class_pentachora])
+            return centroids
+        else:
+            return self.all_centroids_norm
     def compute_pentachora_similarities(self, features: torch.Tensor) -> torch.Tensor:
         if self.similarity_mode == 'rose':
+            all_vertices = torch.stack([penta.vertices for penta in self.class_pentachora])
+            features_exp = features.unsqueeze(1).expand(-1, self.num_classes, -1)
+            return PentachoronStabilizer.rose_score_magnitude(
+                features_exp.reshape(-1, self.pentachora_dim),
+                all_vertices.repeat(features.shape[0], 1, 1)
+            ).reshape(features.shape[0], -1)
         else:
+            centroids = torch.stack([penta.centroid_norm for penta in self.class_pentachora])
+            features_norm = F.normalize(features, dim=-1)
+            return torch.matmul(features_norm, centroids.T)
     def forward_features(self, x: torch.Tensor) -> torch.Tensor:
         B = x.shape[0]
         # Patch embedding
+        x = self.patch_embed(x)
+        x = x.flatten(2).transpose(1, 2)
         # Add CLS token
         cls_tokens = self.cls_token.expand(B, -1, -1)
     def forward(self, x: torch.Tensor, return_features: bool = False) -> Dict[str, torch.Tensor]:
         """
+        Forward pass with optional theta head.
         """
         features = self.forward_features(x)
         output = {}
         # Project to pentachora dimension
         features_proj = self.to_pentachora_dim(features)
+        if self.use_theta_head:
+            # NEW: Use theta-based classification
+            theta_output = self.theta_head(features_proj, self.class_pentachora)
+            output['logits'] = theta_output['logits']
+            output['theta_features'] = theta_output['theta_features']
+            # Still compute similarities for analysis
+            with torch.no_grad():
+                similarities = self.compute_pentachora_similarities(features_proj)
+                output['similarities'] = similarities
         else:
+            # Original: Use similarity-based classification
+            similarities = self.compute_pentachora_similarities(features_proj)
+            logits = similarities * self.temperature.exp()
+            output['logits'] = logits
+            output['similarities'] = similarities
         if return_features:
             output['features'] = features
         return output
+# Helper function to convert existing model to theta
+def enable_theta_head(model: BaselineViT, n_pentachora: int = 10, hidden_dim: int = 256):
+    """
+    Convert an existing similarity-based model to use theta head.
+    This modifies the model in-place.
+    """
+    if model.use_theta_head:
+        print("Model already using theta head")
+        return model
+    print(f"Converting to theta head with {n_pentachora} pentachora...")
+    # Create theta head
+    model.theta_head = ThetaHead(
+        embed_dim=model.pentachora_dim,
+        num_classes=model.num_classes,
+        n_pentachora=n_pentachora,
+        hidden_dim=hidden_dim,
+        dropout=0.1
+    ).to(next(model.parameters()).device)
+    # Set flag
+    model.use_theta_head = True
+    # Initialize new parameters
+    for m in model.theta_head.modules():
+        if isinstance(m, nn.Linear):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+    print("✓ Theta head enabled")
+    return model
 if __name__ == "__main__":
+    print("BaselineViT with optional theta head")
+    print("Use 'use_theta_head=True' to enable theta classification")
+    print("Or call enable_theta_head() on existing model")