klemenk committed
Commit f32a41a · verified · 1 Parent(s): d47c2bd

Update modeling_auristream.py

Files changed (1)
modeling_auristream.py +6 -1
modeling_auristream.py CHANGED
@@ -495,7 +495,7 @@ class CausalSelfAttention(nn.Module):
         q = q.view(B, T, self.n_head, self.head_dim)
         v = v.view(B, T, self.n_head, self.head_dim)
 
-        k_orig = k.clone()
+        k_orig = k.clone().transpose(1, 2)
 
         if self.rotary is not None:
             cos, sin = self.rotary(q)
@@ -550,6 +550,11 @@ class CausalSelfAttention(nn.Module):
         if v_cache is not None:
             v = torch.cat((v_cache, v), dim=2)
 
+        if self.rotary is not None:
+            cos, sin = self.rotary(q)
+            q = apply_rotary_emb(q.transpose(1, 2), cos, sin).transpose(1, 2)
+            k = apply_rotary_emb(k.transpose(1, 2), cos, sin).transpose(1, 2)
+
         # manual implementation of attention
         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
         att = F.softmax(att, dim=-1)
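
For context: the added lines apply the rotary embedding to q and to the full (cache-concatenated) k just before the manual attention step, working in the (B, T, n_head, head_dim) layout that apply_rotary_emb evidently expects, hence the transpose(1, 2) round-trips. Below is a minimal, self-contained sketch of how those lines behave; the apply_rotary_emb body and the cos/sin table construction here are illustrative assumptions, not the actual helpers from modeling_auristream.py.

    import torch

    B, T, n_head, head_dim = 2, 8, 4, 16

    def apply_rotary_emb(x, cos, sin):
        # Assumed RoPE helper: x is (B, T, n_head, head_dim); cos/sin
        # broadcast over batch and heads. Rotates each channel pair
        # (x1, x2) by a position-dependent angle.
        x1, x2 = x.chunk(2, dim=-1)
        rotated = torch.cat((-x2, x1), dim=-1)
        return x * cos + rotated * sin

    # cos/sin tables shaped (1, T, 1, head_dim) so they broadcast in the
    # (B, T, n_head, head_dim) layout used above.
    theta = 10000.0 ** (-torch.arange(0, head_dim, 2).float() / head_dim)
    angles = torch.arange(T).float()[:, None] * theta[None, :]
    cos = torch.cat((angles.cos(), angles.cos()), dim=-1)[None, :, None, :]
    sin = torch.cat((angles.sin(), angles.sin()), dim=-1)[None, :, None, :]

    # q and k in the (B, n_head, T, head_dim) attention layout, as after
    # the view + cache concatenation in the model code.
    q = torch.randn(B, n_head, T, head_dim)
    k = torch.randn(B, n_head, T, head_dim)

    # Mirrors the added diff lines: transpose into (B, T, n_head, head_dim),
    # rotate, transpose back for the attention matmul.
    q = apply_rotary_emb(q.transpose(1, 2), cos, sin).transpose(1, 2)
    k = apply_rotary_emb(k.transpose(1, 2), cos, sin).transpose(1, 2)

    att = (q @ k.transpose(-2, -1)) * (1.0 / k.size(-1) ** 0.5)
    print(att.shape)  # torch.Size([2, 4, 8, 8])

Rotating after the cache concatenation is consistent with storing un-rotated keys (the k_orig change in the first hunk transposes that clone into the same cache layout) and re-applying absolute positions over the full sequence each step, though the exact caching strategy depends on surrounding code not shown in this diff.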