Update modeling_gptbert.py
modeling_gptbert.py  (+11 -8)
@@ -688,7 +688,17 @@ class GptBertPreTrainedModel(PreTrainedModel):
     _supports_flex_attn = False
 
     def _init_weights(self, module):
-
+        std = math.sqrt(2.0 / (5.0 * self.hidden_size))
+
+        if isinstance(module, nn.Linear):
+            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
 
 
 class GptBertModel(GptBertPreTrainedModel):
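Note: the new standard deviation, std = sqrt(2 / (5 * hidden_size)), matches the "small init" scheme of Nguyen & Salazar (Transformers without Tears), and the a=-2*std, b=2*std bounds make trunc_normal_ resample anything beyond two standard deviations. A minimal standalone sketch of the scheme (the hidden size of 768 is an assumed example value, not taken from this repo's config):

import math
import torch.nn as nn

hidden_size = 768  # assumed example value; the real model reads this from its config
std = math.sqrt(2.0 / (5.0 * hidden_size))  # "small init": Var(W) = 2 / (5 * d)

linear = nn.Linear(hidden_size, hidden_size)
nn.init.trunc_normal_(linear.weight, mean=0.0, std=std, a=-2 * std, b=2 * std)

# every weight now lies within two standard deviations of zero
assert linear.weight.abs().max().item() <= 2 * std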
@@ -879,17 +889,11 @@ class Classifier(nn.Module):
         self.emb2vocab.bias.zero_()
 
     def forward(self, x: torch.Tensor):
-        print(x)
         x = self.pre_norm(x)
-        print(x)
         x = self.dropout(x)
-        print(x)
         x = self.projection(x)
-        print(x)
         x = gelu_new(x)
-        print(x)
         x = self.post_norm(x)
-        print(x)
         return self.emb2vocab(x)
 
 
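Note: with the debug prints gone, forward is a standard LM-head pipeline (norm, dropout, projection, GELU, norm, vocabulary projection). A self-contained sketch of the same shape, with assumed dimensions; nn.GELU(approximate="tanh") stands in for gelu_new, which is the tanh-approximated GELU:

import torch
import torch.nn as nn

class ClassifierSketch(nn.Module):
    """Minimal sketch of the head; all dimensions are assumed example values."""
    def __init__(self, hidden_size=768, vocab_size=32768, dropout=0.1):
        super().__init__()
        self.pre_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.projection = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.GELU(approximate="tanh")  # stand-in for gelu_new
        self.post_norm = nn.LayerNorm(hidden_size)
        self.emb2vocab = nn.Linear(hidden_size, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pre_norm(x)
        x = self.dropout(x)
        x = self.projection(x)
        x = self.activation(x)
        x = self.post_norm(x)
        return self.emb2vocab(x)

head = ClassifierSketch()
logits = head(torch.randn(2, 16, 768))  # (batch, seq, hidden) -> (batch, seq, vocab)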
@@ -1043,7 +1047,6 @@ class GptBertForSequenceClassification(GptBertModel):
 
         sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
         logits = self.head(sequence_output[:, 0, :])
-        print(logits)
 
         loss = None
         if labels is not None:
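Note: since this file ships as remote code, both fixes take effect the next time the model is loaded. A hedged usage sketch (the repo id is a placeholder, and loading via AutoModelForSequenceClassification assumes the repo's auto_map points at GptBertForSequenceClassification):

from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo = "org/gpt-bert"  # placeholder repo id
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(repo, trust_remote_code=True)

inputs = tokenizer("GPT-BERT, now without debug prints.", return_tensors="pt")
outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
# the forward pass now runs silently; no intermediate tensors are printed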