InstaDeepAI
/

ChatNT

Text Generation

feature-extraction

Model card Files and versions

Yanisadel committed on Jul 7, 2025

Commit

112bf64

·

verified ·

1 Parent(s): 6d6a20f

Update chatNT.py

Files changed (1) hide show

chatNT.py +4 -2

chatNT.py CHANGED Viewed

@@ -925,7 +925,11 @@ class TorchGptGroupedQueryAttention(nn.Module):
             )
         attention_weights = nn.functional.softmax(attention_logits, dim=-1)
         values = torch.einsum("bhtT,bThd->bthd", attention_weights, values)
         values = values.contiguous().view(batch_size, seq_len, -1)
@@ -1334,8 +1338,6 @@ class MultiHeadAttention(nn.Module):
         else:
             attention_weights = F.softmax(attention_weights, dim=-1)
-        print(f"Attention weights : {attention_weights.dtype}")
-        print(f"Value heads  : {value_heads.dtype}")
         value_out = torch.einsum(
             "...htT, ...Thd->...thd", attention_weights, value_heads
         )

             )
         attention_weights = nn.functional.softmax(attention_logits, dim=-1)
+        attention_weights = attention_weights.to(values.dtype)
+        print(f"Attention weights type : ", attention_weights.dtype)
+        print(f"Values type : ", values.dtype)
         values = torch.einsum("bhtT,bThd->bthd", attention_weights, values)
         values = values.contiguous().view(batch_size, seq_len, -1)
         else:
             attention_weights = F.softmax(attention_weights, dim=-1)
         value_out = torch.einsum(
             "...htT, ...Thd->...thd", attention_weights, value_heads
         )