ltg
/

norbert3-base

@@ -133,12 +133,12 @@ class Attention(nn.Module):
         # Recompute position_indices at the beginning or if sequence length exceeds the precomputed size
         if self.position_indices is None or self.position_indices.size(0) < query_len:
-            position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
-            position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
-            position_indices = self.config.position_bucket_size - 1 + position_indices
         if self.position_indices.device != hidden_states.device:
-            self.position_indices = position_indices.to(hidden_states.device)
         # Pre-LN and project query/key/value.
         hidden_states = self.pre_layer_norm(hidden_states)  # shape: [B, T, D]

         # Recompute position_indices at the beginning or if sequence length exceeds the precomputed size
         if self.position_indices is None or self.position_indices.size(0) < query_len:
+            self.position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
+            self.position_indices = self.make_log_bucket_position(self.position_indices, self.config.position_bucket_size, 512)
+            self.position_indices = self.config.position_bucket_size - 1 + self.position_indices
         if self.position_indices.device != hidden_states.device:
+            self.position_indices = self.position_indices.to(hidden_states.device)
         # Pre-LN and project query/key/value.
         hidden_states = self.pre_layer_norm(hidden_states)  # shape: [B, T, D]