crumb
/

GLORT2

@@ -235,7 +235,7 @@ class TransformerAttention(nn.Module):
             k = torch.cat((past_key, k), dim=-2)
             v = torch.cat((past_value, v), dim=-2)
-        cos, sin = self.rotary_emb(v, seq_len=v.shape[-2])
         q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
         if use_cache is True:

             k = torch.cat((past_key, k), dim=-2)
             v = torch.cat((past_value, v), dim=-2)
+        cos, sin = self.rotary_emb(v, position_ids)
         q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
         if use_cache is True: