ltg
/

norbert3-base

@@ -119,11 +119,7 @@ class Attention(nn.Module):
         self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
         self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
-        position_indices = torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(1) \
-            - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
-        position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
-        position_indices = config.position_bucket_size - 1 + position_indices
-        self.register_buffer("position_indices", position_indices.contiguous(), persistent=False)
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.scale = 1.0 / math.sqrt(3 * self.head_size)
@@ -140,8 +136,8 @@ class Attention(nn.Module):
         batch_size, key_len, _ = hidden_states.size()
         query_len = key_len
-        # Recompute position_indices if sequence length exceeds the precomputed size
-        if self.position_indices.size(0) < query_len:
             position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
             position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
@@ -223,6 +219,7 @@ class NorbertPreTrainedModel(PreTrainedModel):
     base_model_prefix = "norbert3"
     supports_gradient_checkpointing = True
     _tied_weights_keys = {}
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, Encoder):

         self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
         self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
+        self.position_indices = None
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.scale = 1.0 / math.sqrt(3 * self.head_size)
         batch_size, key_len, _ = hidden_states.size()
         query_len = key_len
+        # Recompute position_indices on first use (while it is still None) or if the sequence length exceeds the cached size
+        if self.position_indices is None or self.position_indices.size(0) < query_len:
             position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
             position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
     base_model_prefix = "norbert3"
     supports_gradient_checkpointing = True
     _tied_weights_keys = {}
+    _keys_to_ignore_on_load_unexpected = [r".*position_indices.*"]
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, Encoder):