fix: try to skip initialization of task type embeddings
Browse files- modeling_bert.py +4 -2
modeling_bert.py
CHANGED
|
@@ -145,7 +145,7 @@ def _init_weights(module, initializer_range=0.02):
|
|
| 145 |
nn.init.normal_(module.weight, std=initializer_range)
|
| 146 |
if module.bias is not None:
|
| 147 |
nn.init.zeros_(module.bias)
|
| 148 |
-
elif isinstance(module, nn.Embedding):
|
| 149 |
nn.init.normal_(module.weight, std=initializer_range)
|
| 150 |
if module.padding_idx is not None:
|
| 151 |
nn.init.zeros_(module.weight[module.padding_idx])
|
|
@@ -346,12 +346,14 @@ class BertModel(BertPreTrainedModel):
|
|
| 346 |
self.pooler = BertPooler(config) if add_pooling_layer else None
|
| 347 |
self.task_type_embeddings = nn.Embedding(config.num_tasks, config.hidden_size)
|
| 348 |
|
| 349 |
-
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
|
| 350 |
# We now initialize the task embeddings to 0; We do not use task types during
|
| 351 |
# pretraining. When we start using task types during embedding training,
|
| 352 |
# we want the model to behave exactly as in pretraining (i.e. task types
|
| 353 |
# have no effect).
|
| 354 |
nn.init.zeros_(self.task_type_embeddings.weight)
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
def forward(
|
| 357 |
self,
|
|
|
|
| 145 |
nn.init.normal_(module.weight, std=initializer_range)
|
| 146 |
if module.bias is not None:
|
| 147 |
nn.init.zeros_(module.bias)
|
| 148 |
+
elif isinstance(module, nn.Embedding) and not module.skip_init:
|
| 149 |
nn.init.normal_(module.weight, std=initializer_range)
|
| 150 |
if module.padding_idx is not None:
|
| 151 |
nn.init.zeros_(module.weight[module.padding_idx])
|
|
|
|
| 346 |
self.pooler = BertPooler(config) if add_pooling_layer else None
|
| 347 |
self.task_type_embeddings = nn.Embedding(config.num_tasks, config.hidden_size)
|
| 348 |
|
|
|
|
| 349 |
# We now initialize the task embeddings to 0; We do not use task types during
|
| 350 |
# pretraining. When we start using task types during embedding training,
|
| 351 |
# we want the model to behave exactly as in pretraining (i.e. task types
|
| 352 |
# have no effect).
|
| 353 |
nn.init.zeros_(self.task_type_embeddings.weight)
|
| 354 |
+
self.task_type_embeddings.skip_init = True
|
| 355 |
+
# The apply() call below re-runs _init_weights over every submodule; the
# task type embeddings are skipped because skip_init=True was set above.
# NOTE(review): other nn.Embedding modules (word/position/token-type) never
# get a skip_init attribute, so `not module.skip_init` in _init_weights will
# raise AttributeError for them — use getattr(module, "skip_init", False).
|
| 356 |
+
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
|
| 357 |
|
| 358 |
def forward(
|
| 359 |
self,
|