ltg
/

norbert3-base

@@ -22,6 +22,11 @@ class Encoder(nn.Module):
     def __init__(self, config, activation_checkpointing=False):
         super().__init__()
         self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.num_hidden_layers)])
         self.activation_checkpointing = activation_checkpointing
     def forward(self, hidden_states, attention_mask, relative_embedding):
@@ -114,7 +119,11 @@ class Attention(nn.Module):
         self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
         self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
-        self.position_indices = None
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.scale = 1.0 / math.sqrt(3 * self.head_size)
@@ -131,14 +140,13 @@ class Attention(nn.Module):
         batch_size, key_len, _ = hidden_states.size()
         query_len = key_len
-        # Recompute position_indices at the beginning or if sequence length exceeds the precomputed size
-        if self.position_indices is None or self.position_indices.size(0) < query_len:
-            self.position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
-            self.position_indices = self.make_log_bucket_position(self.position_indices, self.config.position_bucket_size, 512)
-            self.position_indices = self.config.position_bucket_size - 1 + self.position_indices
-        if self.position_indices.device != hidden_states.device:
-            self.position_indices = self.position_indices.to(hidden_states.device)
         # Pre-LN and project query/key/value.
         hidden_states = self.pre_layer_norm(hidden_states)  # shape: [B, T, D]
@@ -214,8 +222,6 @@ class NorbertPreTrainedModel(PreTrainedModel):
     config_class = NorbertConfig
     base_model_prefix = "norbert3"
     supports_gradient_checkpointing = True
-    _tied_weights_keys = {}
-    _keys_to_ignore_on_load_unexpected = [r".*position_indices.*"]
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, Encoder):
@@ -224,12 +230,15 @@ class NorbertPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         std = math.sqrt(2.0 / (5.0 * self.hidden_size))
-        if isinstance(module, nn.Linear) or isinstance(module, nn.Embedding):
-            nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-2*std, b=2*std)
-        elif isinstance(module, nn.LayerNorm) and module.weight is not None:
-            module.weight.fill_(1.0)
-        if hasattr(module, "bias") and module.bias is not None:
-            module.bias.zero_()
 class NorbertModel(NorbertPreTrainedModel):
@@ -242,8 +251,6 @@ class NorbertModel(NorbertPreTrainedModel):
         self.transformer = Encoder(config, activation_checkpointing=gradient_checkpointing)
         self.classifier = MaskClassifier(config, self.embedding.word_embedding.weight) if add_mlm_layer else None
-        self.post_init()
     def get_input_embeddings(self):
         return self.embedding.word_embedding
@@ -308,18 +315,16 @@ class NorbertModel(NorbertPreTrainedModel):
 class NorbertForMaskedLM(NorbertModel):
-    _keys_to_ignore_on_load_unexpected = ["head", r".*position_indices.*"]
-    _tied_weights_keys = {"classifier.nonlinearity.5.weight": "embedding.word_embedding.weight"}
     def __init__(self, config, **kwargs):
         super().__init__(config, add_mlm_layer=True, **kwargs)
-        self.post_init()
     def get_output_embeddings(self):
-        return self.classifier.nonlinearity[-1]
     def set_output_embeddings(self, new_embeddings):
-        self.classifier.nonlinearity[-1] = new_embeddings
     def forward(
         self,
@@ -381,14 +386,13 @@ class Classifier(nn.Module):
 class NorbertForSequenceClassification(NorbertModel):
-    _keys_to_ignore_on_load_unexpected = ["classifier", r".*position_indices.*"]
     def __init__(self, config, **kwargs):
         super().__init__(config, add_mlm_layer=False, **kwargs)
         self.num_labels = config.num_labels
         self.head = Classifier(config, self.num_labels)
-        self.post_init()
     def forward(
         self,
@@ -447,14 +451,13 @@ class NorbertForSequenceClassification(NorbertModel):
 class NorbertForTokenClassification(NorbertModel):
-    _keys_to_ignore_on_load_unexpected = ["classifier", r".*position_indices.*"]
     def __init__(self, config, **kwargs):
         super().__init__(config, add_mlm_layer=False, **kwargs)
         self.num_labels = config.num_labels
         self.head = Classifier(config, self.num_labels)
-        self.post_init()
     def forward(
         self,
@@ -495,14 +498,13 @@ class NorbertForTokenClassification(NorbertModel):
 class NorbertForQuestionAnswering(NorbertModel):
-    _keys_to_ignore_on_load_unexpected = ["classifier", r".*position_indices.*"]
     def __init__(self, config, **kwargs):
         super().__init__(config, add_mlm_layer=False, **kwargs)
         self.num_labels = config.num_labels
         self.head = Classifier(config, self.num_labels)
-        self.post_init()
     def forward(
         self,
@@ -563,14 +565,13 @@ class NorbertForQuestionAnswering(NorbertModel):
 class NorbertForMultipleChoice(NorbertModel):
-    _keys_to_ignore_on_load_unexpected = ["classifier", r".*position_indices.*"]
     def __init__(self, config, **kwargs):
         super().__init__(config, add_mlm_layer=False, **kwargs)
         self.num_labels = getattr(config, "num_labels", 2)
         self.head = Classifier(config, self.num_labels)
-        self.post_init()
     def forward(
         self,

     def __init__(self, config, activation_checkpointing=False):
         """Create the encoder layer stack and rescale feed-forward weights by depth.

         Args:
             config: Model configuration; ``config.num_hidden_layers`` sets how
                 many ``EncoderLayer`` modules are built.
             activation_checkpointing: Flag stored unchanged on the instance as
                 ``self.activation_checkpointing``.
         """
         super().__init__()
         self.activation_checkpointing = activation_checkpointing

         stack = [EncoderLayer(config) for _ in range(config.num_hidden_layers)]
         self.layers = nn.ModuleList(stack)

         # The k-th layer (1-based) has two of its MLP weight matrices
         # multiplied in place by sqrt(1 / (2k)), so deeper layers start smaller.
         for depth, layer in enumerate(self.layers, start=1):
             shrink = math.sqrt(1.0 / (2.0 * depth))
             feed_forward = layer.mlp.mlp
             feed_forward[1].weight.data *= shrink
             feed_forward[-2].weight.data *= shrink
     def forward(self, hidden_states, attention_mask, relative_embedding):
         self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
         self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)
+        position_indices = torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(1) \
+            - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
+        position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
+        position_indices = config.position_bucket_size - 1 + position_indices
+        self.register_buffer("position_indices", position_indices.contiguous(), persistent=False)
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.scale = 1.0 / math.sqrt(3 * self.head_size)
         batch_size, key_len, _ = hidden_states.size()
         query_len = key_len
+        # Recompute position_indices if sequence length exceeds the precomputed size
+        if self.position_indices.size(0) < query_len:
+            position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
+            position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
+            position_indices = self.config.position_bucket_size - 1 + position_indices
+            self.position_indices = position_indices.to(hidden_states.device)
         # Pre-LN and project query/key/value.
         hidden_states = self.pre_layer_norm(hidden_states)  # shape: [B, T, D]
     config_class = NorbertConfig
     base_model_prefix = "norbert3"
     supports_gradient_checkpointing = True
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, Encoder):
     def _init_weights(self, module):
         std = math.sqrt(2.0 / (5.0 * self.hidden_size))
+        if isinstance(module, nn.Linear):
+            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
 class NorbertModel(NorbertPreTrainedModel):
         self.transformer = Encoder(config, activation_checkpointing=gradient_checkpointing)
         self.classifier = MaskClassifier(config, self.embedding.word_embedding.weight) if add_mlm_layer else None
     def get_input_embeddings(self):
         """Return the word-embedding object stored at ``self.embedding.word_embedding``."""
         embedding_block = self.embedding
         return embedding_block.word_embedding
 class NorbertForMaskedLM(NorbertModel):
+    _keys_to_ignore_on_load_unexpected = ["head"]
     def __init__(self, config, **kwargs):
         """Initialise the parent model with ``add_mlm_layer=True``.

         Args:
             config: Model configuration, forwarded to the parent constructor.
             **kwargs: Extra keyword arguments, forwarded to the parent
                 constructor unchanged.
         """
         super().__init__(config, add_mlm_layer=True, **kwargs)
     def get_output_embeddings(self):
         """Return the weight of the last entry in ``self.classifier.nonlinearity``.

         NOTE(review): this hands back the weight tensor itself, not the module
         that owns it. Hugging Face's ``PreTrainedModel`` documents this hook as
         returning an ``nn.Module`` -- confirm what callers expect before
         relying on framework helpers such as weight tying or vocabulary
         resizing.
         """
         output_projection = self.classifier.nonlinearity[-1]
         return output_projection.weight
     def set_output_embeddings(self, new_embeddings):
         """Replace the output projection of the masked-LM classifier.

         Args:
             new_embeddings: Either a weight tensor/parameter, which is
                 assigned as the ``weight`` of the last entry in
                 ``self.classifier.nonlinearity``, or an ``nn.Module``, which
                 replaces that last entry entirely.
         """
         if isinstance(new_embeddings, nn.Module):
             # Assigning a module to `.weight` raises TypeError, so a whole
             # replacement layer is swapped in at the last position instead.
             self.classifier.nonlinearity[-1] = new_embeddings
         else:
             self.classifier.nonlinearity[-1].weight = new_embeddings
     def forward(
         self,
 class NorbertForSequenceClassification(NorbertModel):
+    _keys_to_ignore_on_load_unexpected = ["classifier"]
     def __init__(self, config, **kwargs):
         """Build the parent model without the MLM layer and attach a ``Classifier`` head.

         Args:
             config: Model configuration; ``config.num_labels`` is stored as
                 ``self.num_labels`` and passed to the head.
             **kwargs: Forwarded unchanged to the parent constructor.
         """
         super().__init__(config, add_mlm_layer=False, **kwargs)
         label_count = config.num_labels
         self.num_labels = label_count
         self.head = Classifier(config, label_count)
     def forward(
         self,
 class NorbertForTokenClassification(NorbertModel):
+    _keys_to_ignore_on_load_unexpected = ["classifier"]
     def __init__(self, config, **kwargs):
         """Build the parent model without the MLM layer and attach a ``Classifier`` head.

         Args:
             config: Model configuration; ``config.num_labels`` is stored as
                 ``self.num_labels`` and passed to the head.
             **kwargs: Forwarded unchanged to the parent constructor.
         """
         super().__init__(config, add_mlm_layer=False, **kwargs)
         label_count = config.num_labels
         self.num_labels = label_count
         self.head = Classifier(config, label_count)
     def forward(
         self,
 class NorbertForQuestionAnswering(NorbertModel):
+    _keys_to_ignore_on_load_unexpected = ["classifier"]
     def __init__(self, config, **kwargs):
         """Build the parent model without the MLM layer and attach a ``Classifier`` head.

         Args:
             config: Model configuration; ``config.num_labels`` is stored as
                 ``self.num_labels`` and passed to the head.
             **kwargs: Forwarded unchanged to the parent constructor.
         """
         super().__init__(config, add_mlm_layer=False, **kwargs)
         label_count = config.num_labels
         self.num_labels = label_count
         self.head = Classifier(config, label_count)
     def forward(
         self,
 class NorbertForMultipleChoice(NorbertModel):
+    _keys_to_ignore_on_load_unexpected = ["classifier"]
     def __init__(self, config, **kwargs):
         """Build the parent model without the MLM layer and attach a ``Classifier`` head.

         Args:
             config: Model configuration; ``num_labels`` is read from it and
                 falls back to 2 when the attribute is absent.
             **kwargs: Forwarded unchanged to the parent constructor.
         """
         super().__init__(config, add_mlm_layer=False, **kwargs)
         label_count = getattr(config, "num_labels", 2)
         self.num_labels = label_count
         self.head = Classifier(config, label_count)
     def forward(
         self,