AbstractPhil
/

penta-vit-experiments

Zero-Shot Classification

TensorBoard

Model card Files Files and versions

xet

Metrics Training metrics Community

AbstractPhil committed on Sep 13, 2025

Commit

3678161

verified ·

1 Parent(s): c3d8b53

Create vit_zana_v4_l1.py

Browse files

Files changed (1) hide show

vit_zana_v4_l1.py +368 -0

vit_zana_v4_l1.py ADDED Viewed

	@@ -0,0 +1,368 @@

+"""
+Baseline Vision Transformer with Frozen Pentachora Embeddings
+Adapted for L1-normalized pentachora vertices
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from einops import rearrange
+import math
+from typing import Optional, Tuple, Dict, Any
+class PentachoraEmbedding(nn.Module):
+    """
+    A single frozen pentachora embedding (5 vertices in geometric space).
+    Supports both L1 and L2 normalized vertices.
+
+    Registered buffers (in this order):
+        vertices:      detached private copy of the supplied vertices
+        vertices_norm: vertices normalized along the last dim
+        centroid:      mean of the vertices over dim 0
+        centroid_norm: centroid normalized the same way as the vertices
+
+    Args:
+        vertices: vertex tensor; the last dim is taken as ``embed_dim``.
+            Callers in this file document it as [5, vocab_dim].
+        norm_type: 'l1' selects L1 normalization (sum of abs values = 1);
+            any other value selects L2 normalization.
+    """
+    # Factor applied to L1-based scores, which are numerically smaller than
+    # their L2 counterparts.
+    L1_SCALE = 10.0
+    def __init__(self, vertices: torch.Tensor, norm_type: str = 'l1'):
+        super().__init__()
+        self.embed_dim = vertices.shape[-1]
+        self.norm_type = norm_type
+        # Store a detached copy rather than the caller's tensor. Registering
+        # the argument itself and then clearing requires_grad would flip the
+        # flag on the caller's leaf tensor, and raises RuntimeError for a
+        # non-leaf tensor that requires grad. A copy also keeps the
+        # precomputed buffers below consistent if the caller later mutates
+        # its tensor in place.
+        frozen = vertices.detach().clone()
+        self.register_buffer('vertices', frozen)
+        # Precompute normalized versions and centroid
+        with torch.no_grad():
+            self.register_buffer('vertices_norm', self._normalize(frozen))
+            self.register_buffer('centroid', frozen.mean(dim=0))
+            # Centroid normalization matches vertex normalization
+            self.register_buffer('centroid_norm', self._normalize(self.centroid))
+    def _normalize(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize ``x`` along its last dim using this embedding's norm type."""
+        if self.norm_type == 'l1':
+            # L1 normalize (sum of abs values = 1); epsilon guards all-zero rows
+            return x / (x.abs().sum(dim=-1, keepdim=True) + 1e-8)
+        # L2 normalize (euclidean norm = 1)
+        return F.normalize(x, dim=-1)
+    def get_vertices(self) -> torch.Tensor:
+        """Get the stored (un-normalized) vertices."""
+        return self.vertices
+    def get_centroid(self) -> torch.Tensor:
+        """Get the (un-normalized) centroid of the pentachora."""
+        return self.centroid
+    def compute_rose_score(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Compute Rose similarity score with this pentachora.
+        Scaled appropriately for L1 norm.
+
+        NOTE: relies on ``PentachoronStabilizer.rose_score_magnitude``, which
+        is not defined or imported in this module and must be made available
+        externally before this method is called.
+
+        Args:
+            features: [D] or [B, D]; a 1-D input is treated as a batch of one.
+        """
+        if features.dim() == 1:
+            features = features.unsqueeze(0)
+        # One copy of the vertices per batch row: [B, num_vertices, D]
+        verts = self.vertices.unsqueeze(0).expand(features.shape[0], -1, -1)
+        score = PentachoronStabilizer.rose_score_magnitude(features, verts)
+        if self.norm_type == 'l1':
+            # L1 norm produces smaller values, so amplify the signal
+            score = score * self.L1_SCALE
+        return score
+    def compute_similarity(self, features: torch.Tensor, mode: str = 'centroid') -> torch.Tensor:
+        """
+        Compute similarity between features and this pentachora.
+
+        Args:
+            features: feature tensor whose last dim matches ``embed_dim``.
+            mode: 'rose' delegates to ``compute_rose_score``; 'centroid' takes
+                the dot product with the normalized centroid; any other value
+                is treated as 'max' (largest dot product over the normalized
+                vertices).
+
+        Returns:
+            Similarity per feature row; multiplied by ``L1_SCALE`` when
+            ``norm_type`` is 'l1'.
+        """
+        if mode == 'rose':
+            return self.compute_rose_score(features)
+        # Normalize features according to norm type
+        features_norm = self._normalize(features)
+        if mode == 'centroid':
+            # Dot product with centroid
+            sims = torch.sum(features_norm * self.centroid_norm, dim=-1)
+        else:  # mode == 'max'
+            # Max similarity across vertices
+            sims = torch.matmul(features_norm, self.vertices_norm.T).max(dim=-1).values
+        # Scale up L1 similarities to be comparable to L2. The factor is
+        # positive, so scaling after the max equals scaling before it.
+        if self.norm_type == 'l1':
+            sims = sims * self.L1_SCALE
+        return sims
+class TransformerBlock(nn.Module):
+    """
+    Standard transformer block with multi-head attention and MLP.
+
+    Both sub-layers are pre-norm residual branches: LayerNorm is applied to
+    the branch input and the branch output is added back to the
+    un-normalized stream.
+
+    Args:
+        dim: token embedding width.
+        num_heads: number of attention heads.
+        mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
+        dropout: dropout probability used inside the MLP.
+        attn_dropout: dropout probability passed to ``nn.MultiheadAttention``.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.0,
+        attn_dropout: float = 0.0
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = nn.MultiheadAttention(
+            dim,
+            num_heads,
+            dropout=attn_dropout,
+            batch_first=True
+        )
+        self.norm2 = nn.LayerNorm(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, mlp_hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(mlp_hidden_dim, dim),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply self-attention and the MLP, each with a residual connection.
+
+        Args:
+            x: token sequence with the batch dimension first (the attention
+                module is built with ``batch_first=True``).
+
+        Returns:
+            Tensor of the same shape as ``x``.
+        """
+        # Self-attention
+        x_norm = self.norm1(x)
+        # The attention weights are never used, so skip computing them:
+        # need_weights=False returns the same output while letting PyTorch
+        # use its optimized scaled-dot-product attention path.
+        attn_out, _ = self.attn(x_norm, x_norm, x_norm, need_weights=False)
+        x = x + attn_out
+        # MLP
+        x = x + self.mlp(self.norm2(x))
+        return x
+class BaselineViT(nn.Module):
+    """
+    Vision Transformer with frozen pentachora embeddings.
+    Supports L1-normalized pentachora.
+
+    Images are split into patches by a strided convolution, a learnable CLS
+    token is prepended, and the sequence runs through ``depth`` transformer
+    blocks. The final CLS feature is projected to ``vocab_dim`` (identity
+    when ``vocab_dim == embed_dim``) and scored against one frozen
+    pentachoron per class; the scores times ``exp(temperature)`` are the
+    logits.
+
+    NOTE: ``similarity_mode='rose'`` relies on
+    ``PentachoronStabilizer.rose_score_magnitude``, which is not defined or
+    imported in this module and must be made available externally.
+
+    Args:
+        pentachora_list: one tensor per class, each [5, vocab_dim].
+        vocab_dim: dimensionality of the pentachora space.
+        img_size: input image side length.
+        patch_size: patch side length (kernel and stride of the patch conv).
+        embed_dim: transformer width.
+        depth: number of transformer blocks.
+        num_heads: attention heads per block.
+        mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
+        dropout: dropout after position embedding and inside block MLPs.
+        attn_dropout: dropout inside attention.
+        similarity_mode: 'rose', 'max', or 'centroid'; any other value
+            behaves like 'centroid'.
+        norm_type: 'l1' selects L1 normalization; any other value selects L2.
+    """
+    def __init__(
+        self,
+        pentachora_list: list,  # List of torch.Tensor, each [5, vocab_dim]
+        vocab_dim: int = 256,
+        img_size: int = 32,
+        patch_size: int = 4,
+        embed_dim: int = 512,
+        depth: int = 12,
+        num_heads: int = 8,
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.0,
+        attn_dropout: float = 0.0,
+        similarity_mode: str = 'rose',  # 'centroid', 'max', or 'rose'
+        norm_type: str = 'l1'  # 'l1' or 'l2' normalization
+    ):
+        super().__init__()
+        # Validate pentachora list
+        assert isinstance(pentachora_list, list), f"Expected list, got {type(pentachora_list)}"
+        assert len(pentachora_list) > 0, "Empty pentachora list"
+        for i, penta in enumerate(pentachora_list):
+            assert isinstance(penta, torch.Tensor), f"Item {i} is not a tensor"
+            # Fail at construction time instead of with an opaque shape
+            # error inside the first forward pass.
+            assert penta.shape[-1] == vocab_dim, (
+                f"Item {i} has last dim {penta.shape[-1]}, expected vocab_dim={vocab_dim}")
+        self.num_classes = len(pentachora_list)
+        self.embed_dim = embed_dim
+        self.num_patches = (img_size // patch_size) ** 2
+        self.similarity_mode = similarity_mode
+        self.pentachora_dim = vocab_dim
+        self.norm_type = norm_type
+        # Create individual pentachora embeddings from list
+        self.class_pentachora = nn.ModuleList([
+            PentachoraEmbedding(vertices=penta, norm_type=norm_type)
+            for penta in pentachora_list
+        ])
+        # Patch embedding
+        self.patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
+        # CLS token - learnable
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # Position embeddings
+        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + self.num_patches, embed_dim))
+        self.pos_drop = nn.Dropout(dropout)
+        # Transformer blocks
+        self.blocks = nn.ModuleList([
+            TransformerBlock(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                dropout=dropout,
+                attn_dropout=attn_dropout
+            )
+            for _ in range(depth)
+        ])
+        # Final norm
+        self.norm = nn.LayerNorm(embed_dim)
+        # Project to pentachora dimension if needed
+        if self.pentachora_dim != embed_dim:
+            self.to_pentachora_dim = nn.Linear(embed_dim, self.pentachora_dim)
+        else:
+            self.to_pentachora_dim = nn.Identity()
+        # Temperature for similarity-based classification (stored in log
+        # space; forward() multiplies by temperature.exp()).
+        # For L1 norm, start with lower temperature since similarities are scaled
+        if norm_type == 'l1':
+            self.temperature = nn.Parameter(torch.zeros(1))  # exp(0) = 1
+        else:
+            self.temperature = nn.Parameter(torch.ones(1) * np.log(1/0.07))
+        # Precompute all centroids for efficiency
+        self.register_buffer(
+            'all_centroids',
+            torch.stack([penta.centroid for penta in self.class_pentachora])
+        )
+        # Normalize centroids according to norm type
+        self.register_buffer('all_centroids_norm', self._normalize(self.all_centroids))
+        # Initialize weights
+        self.init_weights()
+    def _normalize(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize ``x`` along its last dim according to ``self.norm_type``."""
+        if self.norm_type == 'l1':
+            return x / (x.abs().sum(dim=-1, keepdim=True) + 1e-8)
+        return F.normalize(x, dim=-1)
+    def init_weights(self):
+        """Initialize model weights."""
+        nn.init.trunc_normal_(self.cls_token, std=0.02)
+        nn.init.trunc_normal_(self.pos_embed, std=0.02)
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+    def get_class_centroids(self) -> torch.Tensor:
+        """Return the normalized class centroids, one row per class."""
+        return self.all_centroids_norm
+    def compute_pentachora_similarities(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Compute similarities between features and all class pentachora.
+        Properly scaled for L1 or L2 norm.
+
+        Args:
+            features: [B, pentachora_dim] projected features.
+
+        Returns:
+            [B, num_classes] scores, multiplied by 10 when ``norm_type`` is 'l1'.
+        """
+        batch = features.shape[0]
+        if self.similarity_mode == 'rose':
+            # Stack all vertices into single tensor for batch Rose scoring
+            all_vertices = torch.stack([penta.vertices for penta in self.class_pentachora])
+            # Flattened row b * num_classes + c pairs sample b with class c;
+            # repeat() tiles the class axis in that same order.
+            features_exp = features.unsqueeze(1).expand(-1, self.num_classes, -1)
+            scores = PentachoronStabilizer.rose_score_magnitude(
+                features_exp.reshape(-1, self.pentachora_dim),
+                all_vertices.repeat(batch, 1, 1)
+            ).reshape(batch, -1)
+            # Scale for L1 norm
+            if self.norm_type == 'l1':
+                scores = scores * 10.0
+            return scores
+        # Normalize features according to norm type
+        features_norm = self._normalize(features)
+        if self.similarity_mode == 'max':
+            # Best-matching vertex per class, mirroring
+            # PentachoraEmbedding.compute_similarity(mode='max'). This mode
+            # used to fall through to centroid scoring.
+            vertices_norm = torch.stack(
+                [penta.vertices_norm for penta in self.class_pentachora])  # [C, V, D]
+            sims = torch.matmul(features_norm, vertices_norm.flatten(0, 1).T)
+            sims = sims.reshape(batch, self.num_classes, -1)  # [B, C, V]
+            if self.norm_type == 'l1':
+                sims = sims * 10.0
+            return sims.max(dim=-1).values
+        # 'centroid' (and any unrecognized mode): dot product with centroids
+        sims = torch.matmul(features_norm, self.get_class_centroids().T)
+        # Scale for L1 norm
+        if self.norm_type == 'l1':
+            sims = sims * 10.0
+        return sims
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        """Extract features from images and return the final CLS token."""
+        B = x.shape[0]
+        # Patch embedding
+        x = self.patch_embed(x)  # [B, embed_dim, H', W']
+        x = x.flatten(2).transpose(1, 2)  # [B, num_patches, embed_dim]
+        # Add CLS token
+        cls_tokens = self.cls_token.expand(B, -1, -1)
+        x = torch.cat([cls_tokens, x], dim=1)
+        # Add position embeddings
+        x = x + self.pos_embed
+        x = self.pos_drop(x)
+        # Apply transformer blocks
+        for block in self.blocks:
+            x = block(x)
+        # Final norm
+        x = self.norm(x)
+        # Return CLS token
+        return x[:, 0]
+    def forward(self, x: torch.Tensor, return_features: bool = False) -> Dict[str, torch.Tensor]:
+        """
+        Forward pass.
+        Returns dict with:
+            - logits: similarities scaled by exp(temperature)
+            - similarities: raw similarities to pentachora
+            - features: CLS features (only if return_features=True)
+            - features_proj: projected features in pentachora space
+              (only if return_features=True)
+        """
+        features = self.forward_features(x)
+        # Project to pentachora dimension
+        features_proj = self.to_pentachora_dim(features)
+        # Apply appropriate normalization for projected features
+        if self.norm_type == 'l1':
+            # L1 normalize the projected features
+            features_proj = features_proj / (features_proj.abs().sum(dim=-1, keepdim=True) + 1e-8)
+        # Compute similarities
+        similarities = self.compute_pentachora_similarities(features_proj)
+        # Scale by temperature
+        output = {
+            'logits': similarities * self.temperature.exp(),
+            'similarities': similarities,
+        }
+        if return_features:
+            output['features'] = features  # Original transformer features
+            output['features_proj'] = features_proj  # Projected features
+        return output
+# Script entry point: there is no self-contained demo, so running the file
+# only lists what has to be supplied externally.
+if __name__ == "__main__":
+    requirements = (
+        "BaselineViT requires:",
+        "  1. PentachoronStabilizer loaded externally",
+        "  2. pentachora_batch tensor [num_classes, 5, vocab_dim]",
+        "\nNo random initialization. No fallbacks.",
+    )
+    for message in requirements:
+        print(message)