Commit
·
65e9690
1
Parent(s):
4ee2970
fix: device
Browse files
Signed-off-by: Meow <ongjackm@gmail.com>
- embedding.py +1 -1
- mha.py +2 -2
- mlp.py +2 -2
- modeling_xlm_roberta.py +1 -1
embedding.py
CHANGED
|
@@ -51,7 +51,7 @@ class XLMRobertaEmbeddings(nn.Module):
|
|
| 51 |
unique_tasks = torch.unique(adapter_mask).tolist()
|
| 52 |
embedding_dtype = next(self.word_embeddings.parameters()).dtype
|
| 53 |
embeddings = torch.empty(*input_ids.shape, self.word_embeddings.embedding_dim,
|
| 54 |
-
dtype=embedding_dtype)
|
| 55 |
for task_id in unique_tasks:
|
| 56 |
task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 57 |
task_input_ids = input_ids[task_indices]
|
|
|
|
| 51 |
unique_tasks = torch.unique(adapter_mask).tolist()
|
| 52 |
embedding_dtype = next(self.word_embeddings.parameters()).dtype
|
| 53 |
embeddings = torch.empty(*input_ids.shape, self.word_embeddings.embedding_dim,
|
| 54 |
+
dtype=embedding_dtype, device=input_ids.device)
|
| 55 |
for task_id in unique_tasks:
|
| 56 |
task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 57 |
task_input_ids = input_ids[task_indices]
|
mha.py
CHANGED
|
@@ -650,7 +650,7 @@ class MHA(nn.Module):
|
|
| 650 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 651 |
qkv_dtype = next(self.Wqkv.parameters()).dtype
|
| 652 |
qkv = torch.empty(x.shape[0], self.Wqkv.out_features,
|
| 653 |
-
dtype=qkv_dtype)
|
| 654 |
for task_id in unique_tasks:
|
| 655 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 656 |
task_tensor = x[task_indices]
|
|
@@ -755,7 +755,7 @@ class MHA(nn.Module):
|
|
| 755 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 756 |
out_dtype = next(self.out_proj.parameters()).dtype
|
| 757 |
out = torch.empty(inp.shape[0], self.out_proj.out_features,
|
| 758 |
-
dtype=out_dtype)
|
| 759 |
for task_id in unique_tasks:
|
| 760 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 761 |
task_tensor = inp[task_indices]
|
|
|
|
| 650 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 651 |
qkv_dtype = next(self.Wqkv.parameters()).dtype
|
| 652 |
qkv = torch.empty(x.shape[0], self.Wqkv.out_features,
|
| 653 |
+
dtype=qkv_dtype, device=x.device)
|
| 654 |
for task_id in unique_tasks:
|
| 655 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 656 |
task_tensor = x[task_indices]
|
|
|
|
| 755 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 756 |
out_dtype = next(self.out_proj.parameters()).dtype
|
| 757 |
out = torch.empty(inp.shape[0], self.out_proj.out_features,
|
| 758 |
+
dtype=out_dtype, device=inp.device)
|
| 759 |
for task_id in unique_tasks:
|
| 760 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 761 |
task_tensor = inp[task_indices]
|
mlp.py
CHANGED
|
@@ -52,7 +52,7 @@ class Mlp(nn.Module):
|
|
| 52 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 53 |
fc1_dtype = next(self.fc1.parameters()).dtype
|
| 54 |
y = torch.empty(x.shape[0], self.fc1.out_features,
|
| 55 |
-
dtype=fc1_dtype)
|
| 56 |
for task_id in unique_tasks:
|
| 57 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 58 |
task_tensor = x[task_indices]
|
|
@@ -67,7 +67,7 @@ class Mlp(nn.Module):
|
|
| 67 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 68 |
fc2_dtype = next(self.fc2.parameters()).dtype
|
| 69 |
out = torch.empty(y.shape[0], self.fc2.out_features,
|
| 70 |
-
dtype=fc2_dtype)
|
| 71 |
for task_id in unique_tasks:
|
| 72 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 73 |
task_tensor = y[task_indices]
|
|
|
|
| 52 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 53 |
fc1_dtype = next(self.fc1.parameters()).dtype
|
| 54 |
y = torch.empty(x.shape[0], self.fc1.out_features,
|
| 55 |
+
dtype=fc1_dtype, device=x.device)
|
| 56 |
for task_id in unique_tasks:
|
| 57 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 58 |
task_tensor = x[task_indices]
|
|
|
|
| 67 |
unique_tasks = torch.unique(cu_adapter_mask).tolist()
|
| 68 |
fc2_dtype = next(self.fc2.parameters()).dtype
|
| 69 |
out = torch.empty(y.shape[0], self.fc2.out_features,
|
| 70 |
+
dtype=fc2_dtype, device=y.device)
|
| 71 |
for task_id in unique_tasks:
|
| 72 |
task_indices = (cu_adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 73 |
task_tensor = y[task_indices]
|
modeling_xlm_roberta.py
CHANGED
|
@@ -317,7 +317,7 @@ class XLMRobertaPooler(nn.Module):
|
|
| 317 |
unique_tasks = torch.unique(adapter_mask).tolist()
|
| 318 |
pool_dtype = next(self.dense.parameters()).dtype
|
| 319 |
pooled_output = torch.empty(first_token_tensor.shape[0], self.dense.out_features,
|
| 320 |
-
dtype=pool_dtype)
|
| 321 |
for task_id in unique_tasks:
|
| 322 |
task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 323 |
task_first_token_tensor = first_token_tensor[task_indices]
|
|
|
|
| 317 |
unique_tasks = torch.unique(adapter_mask).tolist()
|
| 318 |
pool_dtype = next(self.dense.parameters()).dtype
|
| 319 |
pooled_output = torch.empty(first_token_tensor.shape[0], self.dense.out_features,
|
| 320 |
+
dtype=pool_dtype, device=first_token_tensor.device)
|
| 321 |
for task_id in unique_tasks:
|
| 322 |
task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
| 323 |
task_first_token_tensor = first_token_tensor[task_indices]
|