Added missing ViTForEmotionClassification class
maevit.py CHANGED
@@ -245,6 +245,75 @@ class MAEViT(nn.Module):
 
         return loss
 
+# for finetuning
+class ViTForEmotionClassification(nn.Module):
+    """
+    ViT for classification
+    Encoder only
+    """
+    def __init__(
+        self,
+        # default values for ViT-B/16
+        image_size: int = 224,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        encoder_layers: int = 12,
+        encoder_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.0,
+        num_classes: int = 9,  # number of emotion classes (changed by Preksha; originally 7)
+    ):
+        super().__init__()
+        assert image_size % patch_size == 0, "Image size must be divisible by patch size"
+        self.patch_size = patch_size
+
+        # Patch embedding: project each patch to a token of dimension embed_dim
+        self.conv_proj = nn.Conv2d(
+            in_channels=in_chans,
+            out_channels=embed_dim,  # token dimension (for ViT-B/16 this equals patch_size**2 * 3 = 768)
+            kernel_size=patch_size,  # the kernel covers exactly one square patch
+            stride=patch_size,       # stride equals the patch size, so patches do not overlap
+        )
+
+        num_patches = (image_size // patch_size) ** 2
+        self.enc_pos_embed = nn.Parameter(torch.empty(1, num_patches + 1, embed_dim))
+        nn.init.normal_(self.enc_pos_embed, std=0.02)
+
+        # CLS token: a learnable vector that comes to hold an embedding of the whole image
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        nn.init.normal_(self.cls_token, std=0.02)  # normal initialization
+
+        # Transformer encoder: learns contextual relationships between patches and produces embeddings
+        enc_layer = TransformerEncoderLayer(
+            embed_dim=embed_dim,
+            num_heads=encoder_heads,  # for multi-head attention
+            mlp_dim=int(embed_dim * mlp_ratio),
+            dropout=dropout,  # used in the MLP
+        )
+        self.encoder = TransformerEncoder(enc_layer, encoder_layers, embed_dim)  # self-attention & feed-forward
+
+        self.norm = nn.LayerNorm(embed_dim)
+        self.head_norm = nn.LayerNorm(embed_dim)
+        self.head = nn.Linear(embed_dim, num_classes)  # 9 emotions
+
+    def forward(self, imgs):
+        # 1. Patch embedding
+        x = self.conv_proj(imgs)          # [B, embed_dim, H/ps, W/ps]
+        x = x.flatten(2).transpose(1, 2)  # [B, N, embed_dim]
+        x = self.norm(x)                  # [B, N, embed_dim]
+        B, N, D = x.shape
+
+        # 2. Prepend the CLS token and add positional embeddings
+        cls_tokens = self.cls_token.expand(B, -1, -1)  # repeat for batch size
+        x = torch.cat([cls_tokens, x], dim=1)          # [B, N+1, embed_dim]
+        x = x + self.enc_pos_embed
+
+        # 3. Transformer encoder
+        x = self.encoder(x)
+
+        # 4. Classify from the CLS token
+        logits = self.head(self.head_norm(x[:, 0]))
+
+        return logits
+
 class ViTForEmotionClassificationMLP(ViTForEmotionClassification):
     """
     Replace the linear head with MLP
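With the ViT-B/16 defaults, a 224x224 input yields (224/16)^2 = 196 patches, so the encoder sees 197 tokens once the CLS token is prepended, and the head maps the CLS embedding to 9 emotion logits. A quick sanity check for the new class is a forward pass on a dummy batch; the sketch below is illustrative only and assumes maevit.py is on the import path and that the custom TransformerEncoderLayer / TransformerEncoder used above are defined earlier in the same file.

import torch
from maevit import ViTForEmotionClassification  # assumes maevit.py is importable

model = ViTForEmotionClassification()  # ViT-B/16 defaults, 9 emotion classes
model.eval()
imgs = torch.randn(2, 3, 224, 224)     # dummy batch: [B, C, H, W]
with torch.no_grad():
    logits = model(imgs)               # [2, 9]
probs = logits.softmax(dim=-1)         # per-class emotion probabilities
print(logits.shape)                    # torch.Size([2, 9])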