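Some fixes and suggestions: keep `torch.unique` results as tensors (drop `.tolist()`), pass `residual=True` to `Wqkv` when it has parametrizations, build `adapter_mask` on the model's device only when `task_type` is given, and add a `task_type` parameter in `XLMRobertaModel`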

Browse files

Signed-off-by: Meow <ongjackm@gmail.com>

Files changed (5) hide show

embedding.py +2 -2
mha.py +6 -3
mlp.py +2 -2
modeling_lora.py +5 -3
modeling_xlm_roberta.py +2 -1

embedding.py CHANGED Viewed

@@ -48,7 +48,7 @@ class XLMRobertaEmbeddings(nn.Module):
         """
         batch_size, seqlen = input_ids.shape
         if adapter_mask is not None:
-            unique_tasks = torch.unique(adapter_mask).tolist()
             embedding_dtype = next(self.word_embeddings.parameters()).dtype
             embeddings = torch.empty(*input_ids.shape, self.word_embeddings.embedding_dim,
                                             dtype=embedding_dtype, device=input_ids.device)
@@ -71,7 +71,7 @@ class XLMRobertaEmbeddings(nn.Module):
                 token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
             if adapter_mask is not None:
-                unique_tasks = torch.unique(adapter_mask).tolist()
                 for task_id in unique_tasks:
                     task_token_type_embeddings = self.token_type_embeddings(token_type_ids, task_id=task_id)
                     task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]

         """
         batch_size, seqlen = input_ids.shape
         if adapter_mask is not None:
+            unique_tasks = torch.unique(adapter_mask)
             embedding_dtype = next(self.word_embeddings.parameters()).dtype
             embeddings = torch.empty(*input_ids.shape, self.word_embeddings.embedding_dim,
                                             dtype=embedding_dtype, device=input_ids.device)
                 token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
             if adapter_mask is not None:
+                unique_tasks = torch.unique(adapter_mask)
                 for task_id in unique_tasks:
                     task_token_type_embeddings = self.token_type_embeddings(token_type_ids, task_id=task_id)
                     task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]

mha.py CHANGED Viewed

@@ -647,7 +647,7 @@ class MHA(nn.Module):
             assert x_kv is None and mixer_subset is None
             if cu_adapter_mask is not None:
-                unique_tasks = torch.unique(cu_adapter_mask).tolist()
                 qkv_dtype = next(self.Wqkv.parameters()).dtype
                 qkv = torch.empty(x.shape[0], self.Wqkv.out_features,
                                          dtype=qkv_dtype, device=x.device)
@@ -663,7 +663,10 @@ class MHA(nn.Module):
                 if not self.return_residual:
                     qkv = self.Wqkv(x)
                 else:
-                    qkv, x = self.Wqkv(x)
             if self.dwconv:
                 qkv = rearrange(
@@ -752,7 +755,7 @@ class MHA(nn.Module):
         inp = rearrange(context, "... h d -> ... (h d)")
         if cu_adapter_mask is not None:
-            unique_tasks = torch.unique(cu_adapter_mask).tolist()
             out_dtype = next(self.out_proj.parameters()).dtype
             out = torch.empty(inp.shape[0], self.out_proj.out_features,
                                    dtype=out_dtype, device=inp.device)

             assert x_kv is None and mixer_subset is None
             if cu_adapter_mask is not None:
+                unique_tasks = torch.unique(cu_adapter_mask)
                 qkv_dtype = next(self.Wqkv.parameters()).dtype
                 qkv = torch.empty(x.shape[0], self.Wqkv.out_features,
                                          dtype=qkv_dtype, device=x.device)
                 if not self.return_residual:
                     qkv = self.Wqkv(x)
                 else:
+                    if hasattr(self.Wqkv, 'parametrizations'):
+                        qkv, x = self.Wqkv(x, residual=True)
+                    else:
+                        qkv, x = self.Wqkv(x)
             if self.dwconv:
                 qkv = rearrange(
         inp = rearrange(context, "... h d -> ... (h d)")
         if cu_adapter_mask is not None:
+            unique_tasks = torch.unique(cu_adapter_mask)
             out_dtype = next(self.out_proj.parameters()).dtype
             out = torch.empty(inp.shape[0], self.out_proj.out_features,
                                    dtype=out_dtype, device=inp.device)

mlp.py CHANGED Viewed

@@ -49,7 +49,7 @@ class Mlp(nn.Module):
     def forward(self, x, cu_adapter_mask=None):
         if cu_adapter_mask is not None:
-            unique_tasks = torch.unique(cu_adapter_mask).tolist()
             fc1_dtype = next(self.fc1.parameters()).dtype
             y = torch.empty(x.shape[0], self.fc1.out_features,
                               dtype=fc1_dtype, device=x.device)
@@ -64,7 +64,7 @@ class Mlp(nn.Module):
         y = self.activation(y)
         if cu_adapter_mask is not None:
-            unique_tasks = torch.unique(cu_adapter_mask).tolist()
             fc2_dtype = next(self.fc2.parameters()).dtype
             out = torch.empty(y.shape[0], self.fc2.out_features,
                               dtype=fc2_dtype, device=y.device)

     def forward(self, x, cu_adapter_mask=None):
         if cu_adapter_mask is not None:
+            unique_tasks = torch.unique(cu_adapter_mask)
             fc1_dtype = next(self.fc1.parameters()).dtype
             y = torch.empty(x.shape[0], self.fc1.out_features,
                               dtype=fc1_dtype, device=x.device)
         y = self.activation(y)
         if cu_adapter_mask is not None:
+            unique_tasks = torch.unique(cu_adapter_mask)
             fc2_dtype = next(self.fc2.parameters()).dtype
             out = torch.empty(y.shape[0], self.fc2.out_features,
                               dtype=fc2_dtype, device=y.device)

modeling_lora.py CHANGED Viewed

@@ -355,7 +355,9 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
                 f"Supported tasks are: {', '.join(self.config.lora_adaptations)}."
                 f"Alternatively, don't pass the `task_type` argument to disable LoRA."
             )
-        task_id = self._adaptation_map[task_type]
-        num_examples = 1 if isinstance(sentences, str) else len(sentences)
-        adapter_mask = torch.full((num_examples,), task_id, dtype=torch.int32)
         return self.roberta.encode(sentences, *args, adapter_mask=adapter_mask, **kwargs)

                 f"Supported tasks are: {', '.join(self.config.lora_adaptations)}."
                 f"Alternatively, don't pass the `task_type` argument to disable LoRA."
             )
+        adapter_mask = None
+        if task_type:
+            task_id = self._adaptation_map[task_type]
+            num_examples = 1 if isinstance(sentences, str) else len(sentences)
+            adapter_mask = torch.full((num_examples,), task_id, dtype=torch.int32, device=self.device)
         return self.roberta.encode(sentences, *args, adapter_mask=adapter_mask, **kwargs)

modeling_xlm_roberta.py CHANGED Viewed

@@ -314,7 +314,7 @@ class XLMRobertaPooler(nn.Module):
         # to the first token.
         first_token_tensor = hidden_states[:, 0] if pool else hidden_states
         if adapter_mask is not None:
-            unique_tasks = torch.unique(adapter_mask).tolist()
             pool_dtype = next(self.dense.parameters()).dtype
             pooled_output = torch.empty(first_token_tensor.shape[0], self.dense.out_features,
                                             dtype=pool_dtype, device=first_token_tensor.device)
@@ -465,6 +465,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         normalize_embeddings: bool = False,
         truncate_dim: Optional[int] = None,
         adapter_mask: Optional[torch.Tensor] = None,
         **tokenizer_kwargs,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """

         # to the first token.
         first_token_tensor = hidden_states[:, 0] if pool else hidden_states
         if adapter_mask is not None:
+            unique_tasks = torch.unique(adapter_mask)
             pool_dtype = next(self.dense.parameters()).dtype
             pooled_output = torch.empty(first_token_tensor.shape[0], self.dense.out_features,
                                             dtype=pool_dtype, device=first_token_tensor.device)
         normalize_embeddings: bool = False,
         truncate_dim: Optional[int] = None,
         adapter_mask: Optional[torch.Tensor] = None,
+        task_type: Optional[str] = None,
         **tokenizer_kwargs,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """