adalbertojunior
/

mrpt

@@ -584,7 +584,7 @@ class RobertaModel(RobertaPreTrainedModel):
                 to the last attention block of shape [batch_size, sequence_length, hidden_size],
         `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
             classifier pretrained on top of the hidden state associated to the first character of the
-            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
     Example usage:
     ```python
     # Already been converted into WordPiece token ids
@@ -747,7 +747,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         self.post_init()
     @classmethod
-    def from_composer(cls,
                       pretrained_checkpoint,
                       state_dict=None,
                       cache_dir=None,
@@ -756,7 +756,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
                       *inputs,
                       **kwargs):
         """Load from pre-trained."""
-        model = cls(config, *inputs, **kwargs)
         if from_tf:
             raise ValueError(
                 'Mosaic BERT does not support loading TensorFlow weights.')
@@ -779,10 +779,10 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         return model
     def get_output_embeddings(self):
-        return self.cls.predictions.decoder
     def set_output_embeddings(self, new_embeddings):
-        self.cls.predictions.decoder = new_embeddings
     def forward(
         self,
@@ -836,7 +836,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         )
         sequence_output = outputs[0]
-        prediction_scores = self.cls(sequence_output)
         loss = None
         if labels is not None:
@@ -916,7 +916,7 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
         self.post_init()
     @classmethod
-    def from_composer(cls,
                       pretrained_checkpoint,
                       state_dict=None,
                       cache_dir=None,
@@ -925,7 +925,7 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
                       *inputs,
                       **kwargs):
         """Load from pre-trained."""
-        model = cls(config, *inputs, **kwargs)
         if from_tf:
             raise ValueError(
                 'Mosaic BERT does not support loading TensorFlow weights.')

                 to the last attention block of shape [batch_size, sequence_length, hidden_size],
         `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
             classifier pretrained on top of the hidden state associated to the first character of the
+            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
     Example usage:
     ```python
     # Already been converted into WordPiece token ids
         self.post_init()
     @classmethod
+    def from_composer(lm_head,
                       pretrained_checkpoint,
                       state_dict=None,
                       cache_dir=None,
                       *inputs,
                       **kwargs):
         """Load from pre-trained."""
+        model = lm_head(config, *inputs, **kwargs)
         if from_tf:
             raise ValueError(
                 'Mosaic BERT does not support loading TensorFlow weights.')
         return model
     def get_output_embeddings(self):
+        return self.lm_head.predictions.decoder
     def set_output_embeddings(self, new_embeddings):
+        self.lm_head.predictions.decoder = new_embeddings
     def forward(
         self,
         )
         sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
         loss = None
         if labels is not None:
         self.post_init()
     @classmethod
+    def from_composer(lm_head,
                       pretrained_checkpoint,
                       state_dict=None,
                       cache_dir=None,
                       *inputs,
                       **kwargs):
         """Load from pre-trained."""
+        model = lm_head(config, *inputs, **kwargs)
         if from_tf:
             raise ValueError(
                 'Mosaic BERT does not support loading TensorFlow weights.')