ltg/norbert3-base

@@ -22,11 +22,6 @@ class Encoder(nn.Module):
     def __init__(self, config, activation_checkpointing=False):
         super().__init__()
         self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.num_hidden_layers)])
-        for i, layer in enumerate(self.layers):
-            layer.mlp.mlp[1].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
-            layer.mlp.mlp[-2].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
         self.activation_checkpointing = activation_checkpointing
     def forward(self, hidden_states, attention_mask, relative_embedding):
@@ -142,6 +137,7 @@ class Attention(nn.Module):
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
             position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
             position_indices = self.config.position_bucket_size - 1 + position_indices
             self.position_indices = position_indices.to(hidden_states.device)
         # Pre-LN and project query/key/value.
@@ -320,10 +316,10 @@ class NorbertForMaskedLM(NorbertModel):
         self.post_init()
     def get_output_embeddings(self):
-        return self.classifier.nonlinearity[-1].weight
     def set_output_embeddings(self, new_embeddings):
-        self.classifier.nonlinearity[-1].weight = new_embeddings
     def forward(
         self,

     def __init__(self, config, activation_checkpointing=False):
         super().__init__()
         self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.num_hidden_layers)])
         self.activation_checkpointing = activation_checkpointing
     def forward(self, hidden_states, attention_mask, relative_embedding):
                 - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
             position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
             position_indices = self.config.position_bucket_size - 1 + position_indices
+        if self.position_indices.device != hidden_states.device:
             self.position_indices = position_indices.to(hidden_states.device)
         # Pre-LN and project query/key/value.
         self.post_init()
     def get_output_embeddings(self):
+        return self.classifier.nonlinearity[-1]
     def set_output_embeddings(self, new_embeddings):
+        self.classifier.nonlinearity[-1] = new_embeddings
     def forward(
         self,