InstaDeepAI
/

ChatNT

Text Generation

feature-extraction

Model card Files Files and versions

Yanisadel committed on Apr 4, 2025

Commit

02765e9

·

1 Parent(s): c7e0039

Update chatNT.py

Files changed (1) hide show

chatNT.py +4 -4

chatNT.py CHANGED Viewed

@@ -975,7 +975,7 @@ class TorchGptDecoder(nn.Module):
         self, embeddings: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> torch.Tensor:
         if attention_mask is None:
-            attention_mask = build_causal_attention_mask(1, embeddings.shape[1])
         for layer in self.layers:
             embeddings = layer(embeddings, attention_mask)
@@ -985,7 +985,7 @@ class TorchGptDecoder(nn.Module):
         self, token_ids: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> dict[str, torch.Tensor]:
         if attention_mask is None:
-            attention_mask = build_causal_attention_mask(1, token_ids.shape[1])
         tokens_embeddings = self.token_embed(token_ids)
@@ -1127,7 +1127,7 @@ def get_activation_fn(activation_name: str):  # type: ignore
     return activations.get(activation_name, nn.functional.relu)
-def build_causal_attention_mask(batch_size: int, seq_len: int) -> torch.Tensor:
     """
     Builds a batch of causal masks of shape (batch_size, 1, seq_len, seq_len) to feed
     to an attention layer.
@@ -1139,7 +1139,7 @@ def build_causal_attention_mask(batch_size: int, seq_len: int) -> torch.Tensor:
     Returns:
         Batch of causal masks.
     """
-    mask = torch.ones((batch_size, 1, seq_len, seq_len))
     causal_mask = torch.tril(mask)
     return causal_mask

         self, embeddings: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> torch.Tensor:
         if attention_mask is None:
+            attention_mask = build_causal_attention_mask(1, embeddings.shape[1], device=embeddings.device)
         for layer in self.layers:
             embeddings = layer(embeddings, attention_mask)
         self, token_ids: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> dict[str, torch.Tensor]:
         if attention_mask is None:
+            attention_mask = build_causal_attention_mask(1, token_ids.shape[1], device=token_ids.device)
         tokens_embeddings = self.token_embed(token_ids)
     return activations.get(activation_name, nn.functional.relu)
+def build_causal_attention_mask(batch_size: int, seq_len: int, device: torch.device) -> torch.Tensor:
     """
     Builds a batch of causal masks of shape (batch_size, 1, seq_len, seq_len) to feed
     to an attention layer.
     Returns:
         Batch of causal masks.
     """
+    mask = torch.ones((batch_size, 1, seq_len, seq_len), device=device)
     causal_mask = torch.tril(mask)
     return causal_mask