Update modeling_auristream.py
modeling_auristream.py  CHANGED  (+27, -0)
@@ -472,6 +472,33 @@ class CausalSelfAttention(nn.Module):
 
         return y
 
+    def kv_cache_forward(self, x, k_cache=None, v_cache=None):
+        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
+
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+
+        # prepend the cached keys and values to the new keys and values
+        if k_cache is not None:
+            k = torch.cat((k_cache, k), dim=2)
+        if v_cache is not None:
+            v = torch.cat((v_cache, v), dim=2)
+
+        # manual implementation of attention (no causal mask: assumes incremental decoding)
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = F.softmax(att, dim=-1)
+        y = att @ v  # (B, nh, T, T_kv) x (B, nh, T_kv, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
+
+        # output projection
+        y = self.c_proj(y)
+
+        return y, k, v
+
 
 class MLP(nn.Module):
 
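For context, here is a minimal sketch of how a caller might drive this method during incremental decoding. It is an illustrative assumption, not part of this change: the config object, the step_embeddings iterable, and the loop itself are hypothetical, and only CausalSelfAttention.kv_cache_forward comes from the diff above.

    import torch

    attn = CausalSelfAttention(config)  # hypothetical config; class defined in modeling_auristream.py
    k_cache, v_cache = None, None       # empty cache before the first step

    for x in step_embeddings:           # each x: (B, 1, n_embd), one new token per step
        y, k_cache, v_cache = attn.kv_cache_forward(x, k_cache, v_cache)
        # y: attention output for the new token; the returned k_cache/v_cache now
        # cover every token seen so far, shape (B, n_head, tokens_so_far, head_size)

Returning k and v alongside y lets the caller own the cache: each call concatenates the new keys and values onto dim=2 (the time dimension), so each decoding step attends over the full prefix without recomputing past projections. Since no causal mask is applied inside kv_cache_forward, the method is only correct when x carries a single new token, or a block whose tokens are all allowed to attend to one another.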