klemenk committed on
Commit
3845e7a
·
verified ·
1 Parent(s): 12d67f7

Update modeling_auristream.py

Browse files
Files changed (1) hide show
  1. modeling_auristream.py +32 -27
modeling_auristream.py CHANGED
@@ -141,7 +141,6 @@ class AuriStream(PreTrainedModel):
141
  top_k: int = 500, top_p: float = 0.5) -> torch.LongTensor:
142
  """
143
  Samples an integer from the distribution of logits
144
-
145
  Parameters:
146
  logits (torch.FloatTensor): The logits of the distribution
147
  temp (float): The temperature of the sampling, if 0.0, then argmax is used
@@ -403,29 +402,6 @@ class Block(nn.Module):
403
  return x
404
 
405
 
406
- class CausalSelfAttention(nn.Module):
407
-
408
- def __init__(self, config):
409
- super().__init__()
410
- self.n_head = config.n_head
411
- self.n_embd = config.n_embd
412
- self.head_dim = self.n_embd // self.n_head
413
- assert self.n_embd % self.n_head == 0
414
- # key, query, value projections for all heads, but in a batch
415
- self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd, bias=False)
416
- # output projection
417
- self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
418
-
419
- rope_theta = 500000
420
- if hasattr(config, 'rope_theta') and config.rope_theta is not None:
421
- rope_theta = config.rope_theta
422
-
423
- self.rotary = Rotary(self.head_dim, base=rope_theta)
424
-
425
- if hasattr(config, 'use_rope') and not config.use_rope:
426
- self.rotary = None
427
-
428
-
429
  class CausalSelfAttention(nn.Module):
430
 
431
  def __init__(self, config):
@@ -469,20 +445,22 @@ class CausalSelfAttention(nn.Module):
469
  is_causal=True)
470
  else:
471
  # manual implementation of attention
 
 
 
472
  att = torch.einsum('bnsh,bnkh->bnsk', q, k) * (1.0 / math.sqrt(k.size(-1)))
473
  mask = torch.triu(torch.ones(T, T), diagonal=1).to(dtype=torch.bool).to(x.device)
474
  mask = mask.view(1, 1, T, T)
475
  masked_att = att.masked_fill(mask, float('-inf'))
476
  # upcast to float32 for numerical stability, as per llama implementation
477
  masked_att = F.softmax(masked_att, dim=-1, dtype=torch.float32).to(q.dtype)
478
- masked_att = self.attn_dropout(masked_att)
479
  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
480
  y = torch.einsum('bnsk,bnkh->bnsh', masked_att, v)
481
 
482
  y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
483
 
484
  # output projection
485
- y = self.resid_dropout(self.c_proj(y))
486
 
487
  # return attention maps if requested
488
  if return_attn_maps:
@@ -494,6 +472,33 @@ class CausalSelfAttention(nn.Module):
494
 
495
  return y
496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
 
498
  class MLP(nn.Module):
499
 
@@ -560,4 +565,4 @@ class RMSNorm(nn.Module):
560
  output = self._norm(x.float()).type_as(x)
561
  if self.weight is not None:
562
  return output * self.weight
563
- return output
 
141
  top_k: int = 500, top_p: float = 0.5) -> torch.LongTensor:
142
  """
143
  Samples an integer from the distribution of logits
 
144
  Parameters:
145
  logits (torch.FloatTensor): The logits of the distribution
146
  temp (float): The temperature of the sampling, if 0.0, then argmax is used
 
402
  return x
403
 
404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  class CausalSelfAttention(nn.Module):
406
 
407
  def __init__(self, config):
 
445
  is_causal=True)
446
  else:
447
  # manual implementation of attention
448
+ q = q.transpose(1, 2)
449
+ k = k.transpose(1, 2)
450
+ v = v.transpose(1, 2)
451
  att = torch.einsum('bnsh,bnkh->bnsk', q, k) * (1.0 / math.sqrt(k.size(-1)))
452
  mask = torch.triu(torch.ones(T, T), diagonal=1).to(dtype=torch.bool).to(x.device)
453
  mask = mask.view(1, 1, T, T)
454
  masked_att = att.masked_fill(mask, float('-inf'))
455
  # upcast to float32 for numerical stability, as per llama implementation
456
  masked_att = F.softmax(masked_att, dim=-1, dtype=torch.float32).to(q.dtype)
 
457
  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
458
  y = torch.einsum('bnsk,bnkh->bnsh', masked_att, v)
459
 
460
  y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
461
 
462
  # output projection
463
+ y = self.c_proj(y)
464
 
465
  # return attention maps if requested
466
  if return_attn_maps:
 
472
 
473
  return y
474
 
475
def kv_cache_forward(self, x, k_cache=None, v_cache=None):
    """Attention forward pass with an explicit key/value cache.

    Parameters:
        x (torch.FloatTensor): Input activations of shape (B, T, C).
        k_cache (torch.FloatTensor | None): Cached keys of shape
            (B, n_head, S_past, head_dim), or None on the first call.
        v_cache (torch.FloatTensor | None): Cached values, same shape
            convention as ``k_cache``.

    Returns:
        tuple: ``(y, k, v)`` where ``y`` is the attended output of shape
        (B, T, C), and ``k``/``v`` are the updated caches of shape
        (B, n_head, S_past + T, head_dim) to feed into the next call.
    """
    B, T, C = x.size()  # batch size, sequence length, embedding dim (n_embd)

    # Project to query/key/value for all heads in one matmul, then split.
    q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
    q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
    k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
    v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

    # NOTE(review): no rotary embedding is applied here even though
    # __init__ builds self.rotary -- confirm callers only use this path
    # when self.rotary is None, or apply it to q/k before caching.

    # Prepend cached keys/values along the time dimension.
    if k_cache is not None:
        k = torch.cat((k_cache, k), dim=2)
    if v_cache is not None:
        v = torch.cat((v_cache, v), dim=2)

    # Manual attention. S = total key length (cache + new tokens).
    S = k.size(2)
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

    # FIX: apply a causal mask. The original attended to *all* key
    # positions, which is wrong whenever T > 1 (e.g. prompt prefill):
    # query i must not see keys past position (S - T) + i. For the common
    # single-token decode step (T == 1) the mask is empty, so behavior is
    # unchanged there.
    if T > 1:
        causal = torch.triu(
            torch.ones(T, S, dtype=torch.bool, device=x.device),
            diagonal=S - T + 1,
        ).view(1, 1, T, S)
        att = att.masked_fill(causal, float('-inf'))

    # Upcast to float32 for a numerically stable softmax, matching the
    # non-cached forward path.
    att = F.softmax(att, dim=-1, dtype=torch.float32).to(q.dtype)
    y = att @ v  # (B, nh, T, S) x (B, nh, S, hs) -> (B, nh, T, hs)

    y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble head outputs side by side

    # Output projection back to the residual stream.
    y = self.c_proj(y)

    return y, k, v
501
+
502
 
503
  class MLP(nn.Module):
504
 
 
565
  output = self._norm(x.float()).type_as(x)
566
  if self.weight is not None:
567
  return output * self.weight
568
+ return output