Update modeling_auristream.py

modeling_auristream.py  +63 -19
@@ -495,8 +495,6 @@ class CausalSelfAttention(nn.Module):
         q = q.view(B, T, self.n_head, self.head_dim)
         v = v.view(B, T, self.n_head, self.head_dim)
 
-        k_orig = k.clone().transpose(1, 2)
-
         if self.rotary is not None:
             cos, sin = self.rotary(q)
             q = apply_rotary_emb(q, cos, sin)
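Note on this first hunk: forward previously kept a pre-RoPE clone of the keys (k_orig) to hand back as the cache, and the return under return_kv changes to match in the next hunk. The fix drops the clone and caches the rotated k instead, so cached keys already carry the rotation for their absolute position. A minimal standalone sketch (not code from this repo) of why that is safe: rotating a whole sequence at once, as prefill does, and rotating each token alone at its absolute position, as decode does, produce identical keys, so a key rotated once never needs re-rotating.

import torch

def rope_angles(positions, head_dim, base=10000.0):
    # one angle per (position, feature pair): theta_i = pos * base^(-2i/d)
    inv_freq = base ** (-torch.arange(0, head_dim, 2).float() / head_dim)
    return torch.outer(positions.float(), inv_freq)  # (T, head_dim/2)

def rotate(x, ang):
    # rotate consecutive feature pairs of x (T, head_dim) by ang (T, head_dim/2)
    x1, x2 = x[..., 0::2], x[..., 1::2]
    cos, sin = ang.cos(), ang.sin()
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

T, hs = 5, 8
k = torch.randn(T, hs)

# rotate the whole sequence at once (prefill) ...
k_all = rotate(k, rope_angles(torch.arange(T), hs))

# ... equals rotating each token alone at its absolute position (decode)
k_step = torch.cat([rotate(k[t:t + 1], rope_angles(torch.tensor([t]), hs))
                    for t in range(T)])
assert torch.allclose(k_all, k_step)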
@@ -531,40 +529,86 @@ class CausalSelfAttention(nn.Module):
 
         # return key and value caches if requested
         if return_kv:
-            return y, k_orig, v
+            return y, k, v
 
         return y
 
-    def kv_cache_forward(self, x, k_cache=None, v_cache=None):
-        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+    # def kv_cache_forward(self, x, k_cache=None, v_cache=None):
+    #     B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+    #     # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+    #     q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+    #     k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+    #     q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+    #     v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+    #     # append cached keys and values with new keys and values
+    #     if k_cache is not None:
+    #         k = torch.cat((k_cache, k), dim=2)
+    #     if v_cache is not None:
+    #         v = torch.cat((v_cache, v), dim=2)
+
+    #     if self.rotary is not None:
+    #         cos, sin = self.rotary(q)
+    #         q = apply_rotary_emb(q, cos, sin)
+    #         k = apply_rotary_emb(k, cos, sin)
+
+    #     # manual implementation of attention
+    #     att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+    #     att = F.softmax(att, dim=-1)
+    #     y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+    #     y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
 
+    #     # output projection
+    #     y = self.c_proj(y)
+
+    #     return y, k, v
+
+    def kv_cache_forward(self, x, k_cache=None, v_cache=None):
+        B, T, C = x.size() # T=1 for single new token
+
         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
-        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-
-        # append cached keys and values with new keys and values
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, 1, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, 1, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, 1, hs)
+
+        # Apply RoPE BEFORE concatenation, using correct absolute position
+        if self.rotary is not None:
+            # Determine the position of the new token
+            cache_len = k_cache.shape[2] if k_cache is not None else 0
+
+            # Create a dummy tensor with the correct sequence position for rotary computation
+            # We need shape (B, cache_len + 1, nh, hs) but only use the last position
+            dummy = torch.zeros(B, cache_len + T, self.n_head, self.head_dim,
+                                device=q.device, dtype=q.dtype)
+            cos, sin = self.rotary(dummy)
+
+            # Extract rotary embeddings for only the new token position
+            cos = cos[:, cache_len:cache_len+T, :, :]
+            sin = sin[:, cache_len:cache_len+T, :, :]
+
+            # Apply rotary embeddings to new q and k only
+            q = apply_rotary_emb(q, cos, sin)
+            k = apply_rotary_emb(k, cos, sin)
+
+        # NOW concatenate with cache (cached keys already have correct RoPE applied)
         if k_cache is not None:
             k = torch.cat((k_cache, k), dim=2)
         if v_cache is not None:
             v = torch.cat((v_cache, v), dim=2)
-
-        if self.rotary is not None:
-            cos, sin = self.rotary(q)
-            q = apply_rotary_emb(q, cos, sin)
-            k = apply_rotary_emb(k, cos, sin)
-
+
         # manual implementation of attention
         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
         att = F.softmax(att, dim=-1)
         y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
-
+
         y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
-
+
         # output projection
         y = self.c_proj(y)
-
+
         return y, k, v
 
 
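The rewritten kv_cache_forward applies RoPE to the new token's q and k at absolute position cache_len before concatenating onto the cache; the old version, kept above as a comment, rotated after concatenation without accounting for the token's absolute position. The zeros tensor exists only so self.rotary sees the right sequence length, and the resulting cos/sin tables are then sliced down to the single new position. A standalone sketch of that trick, using an illustrative stand-in Rotary module (the repo's implementation may differ in detail, but the diff implies it reads the sequence length from dim 1 of its input):

import torch

class Rotary(torch.nn.Module):
    # illustrative stand-in: derives sequence length from dim 1 of its input
    def __init__(self, head_dim, base=10000.0):
        super().__init__()
        self.inv_freq = base ** (-torch.arange(0, head_dim, 2).float() / head_dim)

    def forward(self, x):                        # x: (B, T, nh, hs)
        t = torch.arange(x.shape[1], dtype=torch.float32)
        ang = torch.outer(t, self.inv_freq)      # (T, hs/2)
        return ang.cos()[None, :, None, :], ang.sin()[None, :, None, :]

rotary = Rotary(head_dim=64)
cache_len, T = 12, 1
dummy = torch.zeros(1, cache_len + T, 1, 64)     # only its length matters
cos, sin = rotary(dummy)                         # tables for positions 0..cache_len
cos = cos[:, cache_len:cache_len + T]            # keep only the new token's
sin = sin[:, cache_len:cache_len + T]            # absolute position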
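Taken together, the two paths pair up as below. This is a hedged driver sketch: the helper and loop are illustrative, it assumes forward accepts the return_kv flag shown in the diff, and the per-step embeddings come from wherever the surrounding model produces them.

import torch

@torch.no_grad()
def decode_with_cache(attn, prompt_emb, step_embs):
    # prefill: run the full prompt once and keep the post-RoPE k/v caches
    y, k_cache, v_cache = attn(prompt_emb, return_kv=True)
    outs = [y]
    # decode: one token at a time; kv_cache_forward rotates the new q/k at
    # absolute position k_cache.shape[2] before concatenating onto the cache
    for x_new in step_embs:                      # each x_new: (B, 1, n_embd)
        y, k_cache, v_cache = attn.kv_cache_forward(x_new, k_cache, v_cache)
        outs.append(y)
    return torch.cat(outs, dim=1)                # (B, prompt_len + steps, n_embd)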