Updated model to match weight shape
clip_mlp.py  +118 -65  CHANGED
@@ -1,65 +1,118 @@

-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-class …
[remaining removed lines of the old 65-line version are not recoverable]
import torch
import torch.nn as nn
import torch.nn.functional as F


class CLIPOffsetMLP(nn.Module):
    """
    MLP that predicts an offset in CLIP embedding space.

    Architecture: concatenated [one-hot vectors, CLIP text embeddings] -> MLP -> offset vector
    Final embedding: E_pred = E_base + E_offset
    """

    def __init__(
        self,
        clip_dim=512,                          # CLIP embedding dimension (512 for ViT-B/32, 768 for ViT-L/14)
        string_embed_dim=512,                  # dimension for string embeddings from CLIP
        num_categories_per_attr=[6, 7, 5, 5],  # number of categories for each discrete attribute
        hidden_dims=[1024, 1024, 512],         # hidden layer dimensions
        normalize_inputs=True,                 # L2-normalize components before concatenation
    ):
        super().__init__()

        self.clip_dim = clip_dim
        self.string_embed_dim = string_embed_dim
        self.num_categories_per_attr = num_categories_per_attr
        self.normalize_inputs = normalize_inputs

        # Calculate input dimensions
        num_discrete_attrs = len(num_categories_per_attr)
        total_onehot_dim = sum(num_categories_per_attr)

        # Assuming 3 textual attributes (constellation, affiliation, etc.)
        num_text_attrs = 3
        total_text_dim = num_text_attrs * string_embed_dim

        # Total input dimension after concatenation
        input_dim = total_onehot_dim + total_text_dim
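        # With the defaults above, total_onehot_dim = 6 + 7 + 5 + 5 = 23 and
        # total_text_dim = 3 * 512 = 1536, so input_dim = 1559. The first
        # Linear weight therefore has shape (hidden_dims[0], 1559), which is
        # the shape the saved weights must match.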

        # Build MLP layers
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.1),
            ])
            prev_dim = hidden_dim

        # Final projection to CLIP dimension (offset vector)
        layers.append(nn.Linear(prev_dim, clip_dim))

        self.mlp = nn.Sequential(*layers)
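        # With the defaults, the Linear weight shapes come out as (1024, 1559),
        # (1024, 1024), (512, 1024), and (512, 512) for the final offset
        # projection; these are the shapes the updated checkpoint should carry.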

    def forward(self, string_embeds, categorical_inputs, base_text_embed):
        """
        Args:
            string_embeds: tensor of shape (batch_size, num_text_attrs, string_embed_dim)
                Pre-computed CLIP embeddings for textual attributes
            categorical_inputs: tensor of shape (batch_size, num_discrete_attrs)
                Integer indices for discrete attributes
            base_text_embed: tensor of shape (batch_size, clip_dim) or (1, clip_dim)
                Base text embedding for "Genshin-style character"

        Returns:
            pred_embeddings: tensor of shape (batch_size, clip_dim)
                E_pred = E_base + E_offset
        """
        batch_size = string_embeds.shape[0]

        # 1. Process one-hot vectors for discrete attributes
        onehot_vectors = []
        for i, num_cats in enumerate(self.num_categories_per_attr):
            onehot = F.one_hot(categorical_inputs[:, i].long(), num_classes=num_cats)
            onehot_vectors.append(onehot.float())

        x_onehot = torch.cat(onehot_vectors, dim=1)  # (batch_size, total_onehot_dim)

        # 2. Process text embeddings
        x_text = string_embeds.reshape(batch_size, -1)  # (batch_size, num_text_attrs * embed_dim)

        # 3. Normalize components before concatenation (as per spec)
        if self.normalize_inputs:
            # Normalize one-hot vector (L2 norm)
            x_onehot = F.normalize(x_onehot, p=2, dim=1)
            # Normalize text embeddings (L2 norm)
            x_text = F.normalize(x_text, p=2, dim=1)
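            # Scale note: each one-hot block has exactly one nonzero entry, so
            # with the 4 default attributes the concatenated one-hot vector has
            # L2 norm sqrt(4) = 2 and every active entry becomes 0.5, putting
            # it on a scale comparable to the unit-norm text part.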

        # 4. Concatenate: x_input = [x_onehot, E_text_attr]
        x_input = torch.cat([x_onehot, x_text], dim=1)

        # 5. Pass through MLP to get offset vector
        offset = self.mlp(x_input)

        # 6. Add offset to base embedding: E_pred = E_base + E_offset
        # Handle broadcasting if base_text_embed is (1, clip_dim)
        if base_text_embed.shape[0] == 1 and batch_size > 1:
            base_text_embed = base_text_embed.expand(batch_size, -1)

        pred_embeddings = base_text_embed + offset

        # 7. Normalize final embedding (CLIP embeddings are typically normalized)
        pred_embeddings = F.normalize(pred_embeddings, p=2, dim=1)

        return pred_embeddings

    def inference(self, string_embeds, categorical_inputs, base_text_embed):
        """
        Inference mode - identical to the forward pass but explicitly named for clarity.

        Args:
            string_embeds: CLIP embeddings of textual attributes
            categorical_inputs: Integer indices for discrete attributes
            base_text_embed: Base embedding for "Genshin-style character"

        Returns:
            E_star: Conditioning vector for diffusion model
        """
        return self.forward(string_embeds, categorical_inputs, base_text_embed)
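
A minimal shape check for the updated model (a sketch: the random tensors stand in for real CLIP text embeddings, and the sizes are just the constructor defaults):

    import torch
    from clip_mlp import CLIPOffsetMLP

    model = CLIPOffsetMLP()  # defaults: clip_dim=512, 4 discrete attrs, 3 text attrs
    batch_size = 8

    string_embeds = torch.randn(batch_size, 3, 512)   # stand-in CLIP text embeddings
    categorical_inputs = torch.stack(
        [torch.randint(0, n, (batch_size,)) for n in [6, 7, 5, 5]], dim=1
    )                                                 # one index per discrete attribute
    base_text_embed = torch.randn(1, 512)             # broadcast across the batch

    pred = model(string_embeds, categorical_inputs, base_text_embed)
    print(pred.shape)        # torch.Size([8, 512])
    print(pred.norm(dim=1))  # all ~1.0 after the final F.normalize

In practice string_embeds and base_text_embed would come from a CLIP text encoder of matching width (512 for ViT-B/32), and loading the shipped weights succeeds only if the first Linear expects 1559 input features, as computed above.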