Siran-Li
/

MATCHA

+"""
+model.py — MATCHA contrastive model architecture.
+ContrastiveModel wraps a pretrained language model backbone and adds a
+SenseNetwork that decomposes word embeddings into multiple "sense" vectors,
+followed by a learned transformation and mean-pooling to produce a single
+sentence embedding for contrastive learning.
+"""
+import torch
+import torch.nn as nn
+from transformers.pytorch_utils import Conv1D
+from transformers.activations import ACT2FN
+from typing import Optional, Tuple
class ContrastiveModel(nn.Module):
    """Top-level model: backbone word embeddings -> SenseNetwork -> projection.

    Token IDs are looked up in the backbone's input embedding layer, expanded
    into several sense vectors per token by ``SenseNetwork``, multiplied by a
    learned square matrix and mean-pooled into one vector per sequence.

    Args:
        contxtl_model: Pretrained HuggingFace model used only for its embedding
            layer. It is stored as a sub-module, so its parameters are part of
            this module's parameters.
        config: Namespace-like object. This class reads ``model_type`` and
            ``n_embd`` and passes the whole object on to ``SenseNetwork``.

    Raises:
        ValueError: If ``config.model_type`` is not a supported backbone type.
    """

    # Backbones whose embedding layer is reached via ``get_input_embeddings()``.
    _INPUT_EMBEDDING_MODEL_TYPES = ('gpt2', 'gpt_neo', 'roberta', 'xlm-roberta')
    # Backbones whose embedding layer is reached via ``model.embed_tokens``.
    _EMBED_TOKENS_MODEL_TYPES = ('mistral',)

    def __init__(self, contxtl_model, config):
        super().__init__()

        # Resolve the embedding layer first so that an unsupported backbone
        # fails here with a clear message, instead of leaving
        # ``word_embeddings`` unset and failing later inside the forward pass.
        if config.model_type in self._INPUT_EMBEDDING_MODEL_TYPES:
            word_embeddings = contxtl_model.get_input_embeddings()
        elif config.model_type in self._EMBED_TOKENS_MODEL_TYPES:
            word_embeddings = contxtl_model.model.embed_tokens
        else:
            supported = (
                self._INPUT_EMBEDDING_MODEL_TYPES + self._EMBED_TOKENS_MODEL_TYPES
            )
            raise ValueError(
                f"Unsupported model_type {config.model_type!r}; "
                f"expected one of {supported}"
            )

        # Sub-modules are registered in a fixed order: sense network,
        # backbone, embedding layer, transformation matrix.
        self.sense_network = SenseNetwork(config)
        self.contxtl_model = contxtl_model
        self.word_embeddings = word_embeddings
        # Learnable transformation applied to sense vectors before pooling,
        # initialised from a standard normal distribution.
        self.transformation_matrix = nn.Parameter(
            torch.randn(config.n_embd, config.n_embd)
        )

    def get_model_output(self, input_ids):
        """Compute multi-sense embeddings from token IDs.

        Args:
            input_ids: Integer tensor of token IDs, shape (bs, s).

        Returns:
            Tensor of sense vectors, shape (bs, nv, s, d).
        """
        sense_input_embeds = self.word_embeddings(input_ids)  # (bs, s, d)
        senses = self.sense_network(sense_input_embeds)       # (bs, nv, s, d)
        return senses

    def forward(self, input_ids):
        """Produce a single sentence embedding by mean-pooling transformed senses.

        Every position contributes equally to the mean; no attention mask is
        applied.

        Args:
            input_ids: Integer tensor of token IDs, shape (bs, s).

        Returns:
            embedding: Tensor of shape (bs, d)

        Raises:
            TypeError: If ``input_ids`` has a floating-point dtype.
        """
        # Token IDs must be integers. A dtype check catches mis-typed input
        # without the device synchronisation that a value-level NaN scan
        # (``isnan(...).any()``) would force on every call.
        if torch.is_floating_point(input_ids):
            raise TypeError(
                f"input_ids must be an integer tensor, got dtype {input_ids.dtype}"
            )

        senses = self.get_model_output(input_ids)                 # (bs, nv, s, d)
        # The matmul broadcasts over the leading (bs, nv, s) dimensions.
        transformed_senses = senses @ self.transformation_matrix  # (bs, nv, s, d)
        # Average over both the sense axis and the sequence axis.
        embedding = transformed_senses.mean(dim=(1, 2))           # (bs, d)
        return embedding
class MLP(nn.Module):
    """Feed-forward block: linear -> activation -> linear -> dropout.

    Uses HuggingFace's Conv1D (equivalent to a linear layer applied
    along the last dimension) for compatibility with GPT-2 style configs.

    Args:
        embed_dim: Size of the last dimension of the input.
        intermediate_dim: Size of the hidden layer between the two projections.
        out_dim: Size of the last dimension of the output.
        config: Object providing ``activation_function`` (a key of ``ACT2FN``)
            and ``resid_pdrop`` (dropout probability applied to the output).
    """

    def __init__(self, embed_dim, intermediate_dim, out_dim, config):
        super().__init__()
        # Conv1D takes (output features, input features), in that order.
        self.c_fc = Conv1D(intermediate_dim, embed_dim)
        self.c_proj = Conv1D(out_dim, intermediate_dim)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward block along the last dimension.

        Args:
            hidden_states: Tensor whose last dimension has size ``embed_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size ``out_dim``.
        """
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
class NoMixBlock(nn.Module):
    """Attention-free transformer-style block (no mixing across tokens).

    The block maintains a residual stream. The incoming hidden states are
    first added to it, then an MLP branch computed from the normalized stream
    is added. Each addition is preceded by dropout on the added term and
    followed by layer normalization. The block itself contains no attention
    layer.

    Args:
        config: Object providing ``n_embd``, ``layer_norm_epsilon`` and
            ``resid_pdrop``; it is also handed to the inner ``MLP``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        eps = config.layer_norm_epsilon
        drop_p = config.resid_pdrop

        self.ln_1 = nn.LayerNorm(width, eps=eps)
        self.ln_2 = nn.LayerNorm(width, eps=eps)
        # Hidden layer of the MLP is four times the embedding width.
        self.mlp = MLP(width, width * 4, width, config)
        self.resid_dropout1 = nn.Dropout(drop_p)
        self.resid_dropout2 = nn.Dropout(drop_p)

    def forward(self, hidden_states, residual):
        """Run the block and return the normalized residual stream.

        Args:
            hidden_states: Tensor added to the residual stream in the first step.
            residual: Current residual stream, broadcast-compatible with
                ``hidden_states``.

        Returns:
            The residual stream after both additions, passed through ``ln_2``.
        """
        # Step 1: fold the incoming hidden states into the stream.
        stream = self.resid_dropout1(hidden_states) + residual
        normed = self.ln_1(stream)

        # Step 2: add the MLP branch computed from the normalized stream.
        stream = self.resid_dropout2(self.mlp(normed)) + stream
        return self.ln_2(stream)
class SenseNetwork(nn.Module):
    """Decomposes token embeddings into multiple sense vectors.

    Each token is mapped from (d,) to (num_senses, d) via a NoMixBlock
    followed by an MLP that expands the embedding dimension and reshapes.

    Input:  (bs, s, d)
    Output: (bs, num_senses, s, d)

    Args:
        config: Object providing ``num_senses``, ``n_embd``, ``embd_pdrop``,
            ``layer_norm_epsilon`` and ``sense_intermediate_scale``; it is also
            handed to the inner ``NoMixBlock`` and ``MLP``.
        device: Optional device to move the module's parameters to after
            construction. ``None`` leaves them where they were created.
        dtype: Optional floating-point dtype to cast the module's parameters
            to after construction. ``None`` keeps the default dtype.
    """

    def __init__(self, config, device=None, dtype=None):
        super().__init__()
        self.num_senses = config.num_senses
        self.n_embd = config.n_embd
        self.dropout = nn.Dropout(config.embd_pdrop)
        self.block = NoMixBlock(config)
        self.ln = nn.LayerNorm(self.n_embd, eps=config.layer_norm_epsilon)
        # The final projection emits all senses at once: num_senses * d features.
        self.final_mlp = MLP(
            embed_dim=config.n_embd,
            intermediate_dim=config.sense_intermediate_scale * config.n_embd,
            out_dim=config.n_embd * config.num_senses,
            config=config,
        )
        # Honour the placement arguments instead of silently ignoring them.
        if device is not None or dtype is not None:
            self.to(device=device, dtype=dtype)

    def forward(self, input_embeds):
        """Expand token embeddings into per-token sense vectors.

        Args:
            input_embeds: Tensor of shape (bs, s, d).

        Returns:
            Tensor of shape (bs, num_senses, s, d).

        Raises:
            ValueError: If the projected tensor is not 3-dimensional (the
                shape unpacking below requires exactly three axes).
        """
        residual = self.dropout(input_embeds)
        hidden_states = self.ln(residual)
        hidden_states = self.block(hidden_states, residual)
        senses = self.final_mlp(hidden_states)  # (bs, s, num_senses * d)
        bs, s, _ = senses.shape
        # Reshape from (bs, s, num_senses*d) -> (bs, num_senses, s, d)
        return senses.reshape(bs, s, self.num_senses, self.n_embd).transpose(1, 2)