Update modeling_auristream.py

modeling_auristream.py  +47 -6

@@ -487,14 +487,55 @@ class CausalSelfAttention(nn.Module):
         if v_cache is not None:
             v = torch.cat((v_cache, v), dim=2)

-
-
-
-
+        if not return_kv and not return_attn_maps:
+            y = F.scaled_dot_product_attention(
+                q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
+                is_causal=True)
+        else:
+            # manual implementation of attention
+            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+            att = F.softmax(att, dim=-1)
+            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+        # output projection
+        y = self.c_proj(y)

-
+        return y, k, v

-
+    def kv_cache_forward(
+        self,
+        x: torch.Tensor,
+        k_cache: torch.Tensor | None = None,
+        v_cache: torch.Tensor | None = None,
+    ):
+        B, T, C = x.size()
+
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        q = q.view(B, T, self.n_head, self.head_dim) # (B, T, n_head, d)
+        k = k.view(B, T, self.n_head, self.head_dim)
+        v = v.view(B, T, self.n_head, self.head_dim)
+
+        if self.rotary is not None:
+            cos, sin = self.rotary(q) # cos/sin match (B, T, n_head, d)
+            q = apply_rotary_emb(q, cos, sin)
+            k = apply_rotary_emb(k, cos, sin)
+
+        q = q.transpose(1, 2) # (B, n_head, T, d)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        if k_cache is not None:
+            k = torch.cat([k_cache, k], dim=2) # time dim grows
+        if v_cache is not None:
+            v = torch.cat([v_cache, v], dim=2)
+
+        y = F.scaled_dot_product_attention(
+            q, k, v, is_causal=True # PyTorch ≥ 2.1
+        ) # (B, n_head, T, d)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
         y = self.c_proj(y)

         return y, k, v
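
The added kv_cache_forward returns the updated key/value tensors alongside the output, so a caller is expected to feed them back in on the next decoding step. Below is a minimal, self-contained sketch of that pattern; the StandInAttention module, its sizes, and its causal-mask handling are illustrative assumptions, not the repository's CausalSelfAttention.

# Sketch: threading a per-layer KV cache through prefill + incremental decoding.
# StandInAttention only mirrors the (x, k_cache, v_cache) -> (y, k, v) contract of
# kv_cache_forward above; it is a hypothetical stand-in, not the repository's class.
import torch
import torch.nn as nn
import torch.nn.functional as F

class StandInAttention(nn.Module):
    def __init__(self, n_embd: int, n_head: int):
        super().__init__()
        self.n_embd, self.n_head = n_embd, n_head
        self.head_dim = n_embd // n_head
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        self.c_proj = nn.Linear(n_embd, n_embd)

    def kv_cache_forward(self, x, k_cache=None, v_cache=None):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q, k, v = (t.view(B, T, self.n_head, self.head_dim).transpose(1, 2) for t in (q, k, v))
        if k_cache is not None:
            k = torch.cat([k_cache, k], dim=2)  # time dimension grows
        if v_cache is not None:
            v = torch.cat([v_cache, v], dim=2)
        # Mask only when q and k cover the same positions (prefill); a single-token
        # decode step may attend to the whole cache.
        y = F.scaled_dot_product_attention(q, k, v, is_causal=(q.size(2) == k.size(2)))
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y), k, v

attn = StandInAttention(n_embd=64, n_head=4)
prompt = torch.randn(1, 8, 64)                       # (B, T, C) prefill input
y, k_cache, v_cache = attn.kv_cache_forward(prompt)  # caches now cover the 8 prompt positions
for _ in range(4):                                   # decode one position at a time
    step = torch.randn(1, 1, 64)                     # stand-in features for the newest position
    y, k_cache, v_cache = attn.kv_cache_forward(step, k_cache, v_cache)
print(k_cache.shape)  # torch.Size([1, 4, 12, 16]): 8 prompt + 4 decoded positions

Because each decode step passes a single query against a longer cache, the sketch only applies the causal mask during prefill, when query and key lengths match.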