import torch
import torch.nn.functional as F
from torch import nn
from transformers.models.t5.configuration_t5 import T5Config


class M5Pooler(nn.Module):
    """Pool encoder hidden states into a single sentence embedding.

    The pooled vector is a learned softmax-weighted combination of
    (a) the mean of the non-padding token embeddings at positions 1..seq_len-1
    and (b) the embedding at position 0, which this module treats as a
    CLS-style summary token.
    """

    def __init__(self, config: T5Config):
        """
        Args:
            config: T5 configuration; only ``pad_token_id`` is read, to
                identify padding positions when building the mean-pool mask.
        """
        super().__init__()
        # Two learnable logits, softmax-ed in forward() so the mixing
        # coefficients always sum to 1. Initialized to an even split.
        self.pool_weights = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.pad_token_id = config.pad_token_id

    def forward(self, input_ids: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
        """Compute the pooled embedding.

        Args:
            input_ids: ``[batch, seq_len]`` token ids, used only to locate
                padding positions.
            hidden_states: ``[batch, seq_len, hidden_dim]`` encoder outputs
                aligned position-for-position with ``input_ids``.

        Returns:
            ``[batch, hidden_dim]`` pooled embeddings.
        """
        # Mask of real (non-padding) tokens among the non-CLS positions
        # (everything after index 0). Cast to the hidden-state dtype so
        # fp16/bf16 activations are not silently upcast to fp32 by the
        # multiply below.
        mask = (input_ids[:, 1:] != self.pad_token_id).unsqueeze(-1).to(hidden_states.dtype)  # [batch, seq_len-1, 1]
        atoms = hidden_states[:, 1:, :]

        # Zero out padding embeddings, then average over real tokens only.
        masked_embedded = atoms * mask  # [batch, seq_len-1, hidden_dim]
        sum_embedded = masked_embedded.sum(dim=1)  # [batch, hidden_dim]
        # clamp guards division by zero when every non-CLS token is padding;
        # in that case sum_embedded is 0, so mean_pool is 0 as well.
        num_real_tokens = mask.sum(dim=1).clamp(min=1e-9)  # [batch, 1]
        mean_pool = sum_embedded / num_real_tokens  # [batch, hidden_dim]

        cls_token = hidden_states[:, 0, :]

        # Learned convex combination of the mean pool and the CLS token.
        weights = F.softmax(self.pool_weights, dim=0)
        pooled = weights[0] * mean_pool + weights[1] * cls_token
        return pooled