Update modeling_auristream.py
Browse files- modeling_auristream.py +63 -5
modeling_auristream.py
CHANGED
|
@@ -425,7 +425,37 @@ class CausalSelfAttention(nn.Module):
|
|
| 425 |
if hasattr(config, 'use_rope') and not config.use_rope:
|
| 426 |
self.rotary = None
|
| 427 |
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
| 431 |
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
|
@@ -440,13 +470,41 @@ class CausalSelfAttention(nn.Module):
|
|
| 440 |
q = apply_rotary_emb(q, cos, sin)
|
| 441 |
k = apply_rotary_emb(k, cos, sin)
|
| 442 |
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
|
| 447 |
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
|
|
|
|
| 448 |
# output projection
|
| 449 |
-
y = self.c_proj(y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
return y
|
| 451 |
|
| 452 |
|
|
|
|
| 425 |
if hasattr(config, 'use_rope') and not config.use_rope:
|
| 426 |
self.rotary = None
|
| 427 |
|
| 428 |
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with optional rotary position embeddings.

    Query, key and value projections for all heads are fused into a single
    bias-free linear layer (``c_attn``); ``c_proj`` maps the concatenated
    head outputs back to the embedding dimension.  A flag (``self.tpu``)
    records whether ``torch_xla`` is importable so ``forward`` can pick a
    TPU flash-attention kernel.
    """

    def __init__(self, config):
        """Build the attention layer from *config*.

        Args:
            config: object exposing ``n_head`` and ``n_embd``; optionally
                ``rope_theta`` (RoPE base frequency, default ``500000``) and
                ``use_rope`` (set to a falsy value to disable rotary
                embeddings entirely).

        Raises:
            ValueError: if ``n_embd`` is not divisible by ``n_head``.
        """
        super().__init__()
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Validate divisibility BEFORE deriving the per-head width, and raise
        # instead of assert (asserts are stripped under `python -O`, which
        # would let `//` silently truncate a bad config).
        if self.n_embd % self.n_head != 0:
            raise ValueError(
                f"n_embd ({self.n_embd}) must be divisible by n_head ({self.n_head})"
            )
        self.head_dim = self.n_embd // self.n_head

        # key, query, value projections for all heads, fused in one matmul
        self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd, bias=False)
        # output projection
        self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)

        # Rotary position embeddings.  When RoPE is explicitly disabled we
        # skip construction entirely (the previous code built a Rotary object
        # and then immediately discarded it).
        if getattr(config, 'use_rope', True):
            rope_theta = getattr(config, 'rope_theta', None)
            if rope_theta is None:
                rope_theta = 500000  # Llama-3-style default base frequency
            self.rotary = Rotary(self.head_dim, base=rope_theta)
        else:
            self.rotary = None

        # Detect TPU availability: torch_xla importing cleanly is the signal;
        # the imported module itself is not used here.
        try:
            import torch_xla.core.xla_model  # noqa: F401
            self.tpu = True
        except ImportError:
            self.tpu = False
|
| 458 |
+
def forward(self, x, return_kv=False, return_attn_maps=False):
|
| 459 |
|
| 460 |
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
| 461 |
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
|
|
|
| 470 |
q = apply_rotary_emb(q, cos, sin)
|
| 471 |
k = apply_rotary_emb(k, cos, sin)
|
| 472 |
|
| 473 |
+
if self.tpu and not return_kv and not return_attn_maps:
|
| 474 |
+
from torch_xla.experimental.custom_kernel import flash_attention
|
| 475 |
+
q_norm = q / math.sqrt(k.size(-1))
|
| 476 |
+
y = flash_attention(
|
| 477 |
+
q_norm.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
|
| 478 |
+
causal=True, partition_spec=('fsdp', None, None, None))
|
| 479 |
+
elif not return_kv and not return_attn_maps:
|
| 480 |
+
y = F.scaled_dot_product_attention(
|
| 481 |
+
q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
|
| 482 |
+
is_causal=True)
|
| 483 |
+
else:
|
| 484 |
+
# manual implementation of attention
|
| 485 |
+
att = torch.einsum('bnsh,bnkh->bnsk', q, k) * (1.0 / math.sqrt(k.size(-1)))
|
| 486 |
+
mask = torch.triu(torch.ones(T, T), diagonal=1).to(dtype=torch.bool).to(x.device)
|
| 487 |
+
mask = mask.view(1, 1, T, T)
|
| 488 |
+
masked_att = att.masked_fill(mask, float('-inf'))
|
| 489 |
+
# upcast to float32 for numerical stability, as per llama implementation
|
| 490 |
+
masked_att = F.softmax(masked_att, dim=-1, dtype=torch.float32).to(q.dtype)
|
| 491 |
+
masked_att = self.attn_dropout(masked_att)
|
| 492 |
+
# (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
|
| 493 |
+
y = torch.einsum('bnsk,bnkh->bnsh', masked_att, v)
|
| 494 |
|
| 495 |
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
|
| 496 |
+
|
| 497 |
# output projection
|
| 498 |
+
y = self.resid_dropout(self.c_proj(y))
|
| 499 |
+
|
| 500 |
+
# return attention maps if requested
|
| 501 |
+
if return_attn_maps:
|
| 502 |
+
return y, F.softmax(att, dim=-1)
|
| 503 |
+
|
| 504 |
+
# return key and value caches if requested
|
| 505 |
+
if return_kv:
|
| 506 |
+
return y, k, v
|
| 507 |
+
|
| 508 |
return y
|
| 509 |
|
| 510 |
|