Commit: Simplified the model by always computing batch-first
File changed: modeling_norbert.py (+40 lines, -44 lines)
modeling_norbert.py
CHANGED
|
@@ -139,10 +139,10 @@ class Attention(nn.Module):
|
|
| 139 |
return bucket_pos
|
| 140 |
|
| 141 |
def forward(self, hidden_states, attention_mask, relative_embedding):
|
| 142 |
-
|
| 143 |
query_len = key_len
|
| 144 |
|
| 145 |
-
#
|
| 146 |
if self.position_indices.size(0) < query_len:
|
| 147 |
position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
|
| 148 |
- torch.arange(query_len, dtype=torch.long).unsqueeze(0)
|
|
@@ -150,55 +150,52 @@ class Attention(nn.Module):
|
|
| 150 |
position_indices = self.config.position_bucket_size - 1 + position_indices
|
| 151 |
self.position_indices = position_indices.to(hidden_states.device)
|
| 152 |
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
#
|
| 156 |
-
query,
|
| 157 |
-
|
|
|
|
| 158 |
|
| 159 |
-
#
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
|
|
|
| 163 |
|
| 164 |
-
#
|
| 165 |
-
|
|
|
|
| 166 |
|
| 167 |
-
#
|
| 168 |
-
|
| 169 |
-
query_pos, key_pos = pos.view(-1, self.num_heads, 2*self.head_size).chunk(2, dim=2) # [2T-1, H, D]
|
| 170 |
|
| 171 |
-
#
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
#
|
| 176 |
-
|
| 177 |
|
| 178 |
-
#
|
| 179 |
-
|
| 180 |
-
|
| 181 |
|
| 182 |
-
#
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
attention_scores.add_(attention_p_c)
|
| 190 |
-
|
| 191 |
-
attention_scores = attention_scores.masked_fill(attention_mask, float('-inf'))
|
| 192 |
-
attention_probs = F.softmax(attention_scores, dim=-1)
|
| 193 |
-
|
| 194 |
-
attention_probs = self.dropout(attention_probs)
|
| 195 |
-
context = torch.bmm(attention_probs.flatten(0, 1), value) # shape: [B*H, Q, D]
|
| 196 |
-
context = context.transpose(0, 1).reshape(context.size(1), -1, self.hidden_size) # shape: [Q, B, H*D]
|
| 197 |
-
context = self.out_proj(context)
|
| 198 |
-
context = self.post_layer_norm(context)
|
| 199 |
-
context = self.dropout(context)
|
| 200 |
|
| 201 |
-
return
|
| 202 |
|
| 203 |
|
| 204 |
class Embedding(nn.Module):
|
|
@@ -281,9 +278,8 @@ class NorbertModel(NorbertPreTrainedModel):
|
|
| 281 |
attention_mask = ~attention_mask.bool()
|
| 282 |
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
| 283 |
|
| 284 |
-
static_embeddings, relative_embedding = self.embedding(input_ids
|
| 285 |
contextualized_embeddings, attention_probs = self.transformer(static_embeddings, attention_mask, relative_embedding)
|
| 286 |
-
contextualized_embeddings = [e.transpose(0, 1) for e in contextualized_embeddings]
|
| 287 |
last_layer = contextualized_embeddings[-1]
|
| 288 |
contextualized_embeddings = [contextualized_embeddings[0]] + [
|
| 289 |
contextualized_embeddings[i] - contextualized_embeddings[i - 1]
|
|
|
|
| 139 |
return bucket_pos
|
| 140 |
|
| 141 |
def forward(self, hidden_states, attention_mask, relative_embedding):
|
| 142 |
+
batch_size, key_len, _ = hidden_states.size()
|
| 143 |
query_len = key_len
|
| 144 |
|
| 145 |
+
# Recompute position_indices if sequence length exceeds the precomputed size
|
| 146 |
if self.position_indices.size(0) < query_len:
|
| 147 |
position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
|
| 148 |
- torch.arange(query_len, dtype=torch.long).unsqueeze(0)
|
|
|
|
| 150 |
position_indices = self.config.position_bucket_size - 1 + position_indices
|
| 151 |
self.position_indices = position_indices.to(hidden_states.device)
|
| 152 |
|
| 153 |
+
# Pre-LN and project query/key/value.
|
| 154 |
+
hidden_states = self.pre_layer_norm(hidden_states) # shape: [B, T, D]
|
| 155 |
+
query, key = self.in_proj_qk(hidden_states).chunk(2, dim=-1) # shape: [B, T, D]
|
| 156 |
+
value = self.in_proj_v(hidden_states) # shape: [B, T, D]
|
| 157 |
|
| 158 |
+
# Reshape to [B, num_heads, T, head_size]
|
| 159 |
+
query = query.reshape(B, T, self.num_heads, self.head_size).transpose(1, 2) # shape: [B, num_heads, T_q, head_size]
|
| 160 |
+
key = key.reshape(B, T, self.num_heads, self.head_size).permute(0, 2, 3, 1) # shape: [B, num_heads, head_size, T_k]
|
| 161 |
+
value = value.view(B, T, self.num_heads, self.head_size).transpose(1, 2) # shape: [B, num_heads, T_k, head_size]
|
| 162 |
|
| 163 |
+
# Compute relative positional contributions
|
| 164 |
+
pos = self.in_proj_qk(self.dropout(relative_embedding)) # shape: [2*position_bucket_size - 1, 2D]
|
| 165 |
+
query_pos, key_pos = pos.view(-1, self.num_heads, 2*self.head_size).chunk(2, dim=2) # shape: [2*position_bucket_size - 1, num_heads, head_size]
|
| 166 |
+
query_pos = query_pos.permute(1, 0, 2) # shape: [num_heads, 2*position_bucket_size - 1, head_size]
|
| 167 |
+
key_pos = key_pos.permute(1, 0, 2) # shape: [num_heads, 2*position_bucket_size - 1, head_size]
|
| 168 |
|
| 169 |
+
# Scale the keys
|
| 170 |
+
key = key * self.scale
|
| 171 |
+
key_pos = key_pos * self.scale
|
| 172 |
|
| 173 |
+
# Compute standard content-to-content attention scores
|
| 174 |
+
attention_c_to_c = torch.matmul(query, key) # shape: [B, num_heads, T_q, T_k]
|
|
|
|
| 175 |
|
| 176 |
+
# Compute content-to-position and position-to-content attention scores
|
| 177 |
+
position_indices = self.position_indices[:query_len, :key_len].unsqueeze(0).unsqueeze(0).expand(batch_size, self.num_heads, query_len, key_len)
|
| 178 |
+
attention_c_to_p = torch.matmul(query, key_pos.unsqueeze(0)) # [B, num_heads, T-q, 2*position_bucket_size - 1]
|
| 179 |
+
attention_p_to_c = torch.matmul(query_pos.unsqueeze(0), key) # [B, num_heads, 2*position_bucket_size - 1, T_k]
|
| 180 |
+
attention_c_to_p = attention_c_to_p.gather(3, position_indices) # shape: [B, num_heads, T_q, T_k]
|
| 181 |
+
attention_p_to_c = attention_p_to_c.gather(2, position_indices) # shape: [B, num_heads, T_q, T_k]
|
| 182 |
|
| 183 |
+
# Full attention score
|
| 184 |
+
attention_scores = attention_c_to_c + attention_c_to_p + attention_p_to_c # shape: [B, num_heads, T_q, T_k]
|
| 185 |
|
| 186 |
+
# Masked softmax
|
| 187 |
+
attention_scores = attention_scores.masked_fill(attention_mask, float('-inf')) # shape: [B, num_heads, T_q, T_k]
|
| 188 |
+
attention_probs = F.softmax(attention_scores, dim=-1) # shape: [B, num_heads, T_q, T_k]
|
| 189 |
|
| 190 |
+
# Collect the weighted-averaged values
|
| 191 |
+
attention_probs = self.dropout(attention_probs) # shape: [B, num_heads, T_q, T_k]
|
| 192 |
+
output = torch.matmul(attention_probs, value) # shape: [B, num_heads, T_q, head_size]
|
| 193 |
+
output = output.transpose(1, 2).flatten(2, 3) # shape: [B, T_q, D]
|
| 194 |
+
output = self.out_proj(output)
|
| 195 |
+
output = self.post_layer_norm(output)
|
| 196 |
+
output = self.dropout(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
+
return output, attention_probs.detach()
|
| 199 |
|
| 200 |
|
| 201 |
class Embedding(nn.Module):
|
|
|
|
| 278 |
attention_mask = ~attention_mask.bool()
|
| 279 |
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
| 280 |
|
| 281 |
+
static_embeddings, relative_embedding = self.embedding(input_ids)
|
| 282 |
contextualized_embeddings, attention_probs = self.transformer(static_embeddings, attention_mask, relative_embedding)
|
|
|
|
| 283 |
last_layer = contextualized_embeddings[-1]
|
| 284 |
contextualized_embeddings = [contextualized_embeddings[0]] + [
|
| 285 |
contextualized_embeddings[i] - contextualized_embeddings[i - 1]
|