Update modeling_auristream.py
modeling_auristream.py (CHANGED: +101 -16)
@@ -165,7 +165,6 @@ class AuriStream(PreTrainedModel):
                )
                if output_logits:
                    all_logits.append(future_logits)
-            loss = loss / (len(self.future_heads) + 1)

        if return_dict:
            if output_logits:
@@ -195,12 +194,47 @@ class AuriStream(PreTrainedModel):
                return model_output

            return logits, loss
+
+        else:
+            if output_logits:
+                all_logits = [logits]
+
+            # future multi-step heads (unchanged)
+            if self.future_heads is not None:
+                for i, head in enumerate(self.future_heads):
+                    future_logits = head(x[:, :-(i + 1)])
+                    if output_logits:
+                        all_logits.append(future_logits)

+            if return_dict:
+                if output_logits:
+                    if output_hidden_states:
+                        model_output = CausalLMOutput(
+                            logits=all_logits,
+                            hidden_states=hs_to_return,
+                        )
+                    else:
+                        model_output = CausalLMOutput(
+                            logits=all_logits,
+                        )
+                else:
+                    if output_hidden_states:
+                        model_output = CausalLMOutput(
+                            logits=logits,
+                            hidden_states=hs_to_return,
+                        )
+                    else:
+                        model_output = CausalLMOutput(
+                            logits=logits,
+                        )
+                return model_output
+
+            return logits, loss
+
        return logits, None

-
    def sample_logits(self, logits: torch.FloatTensor, temperature: float = 0.9,
-                      top_k: int = None) -> torch.LongTensor:
+                      top_k: int = None, top_p: float = None) -> torch.LongTensor:
        """
        Samples an integer from the distribution of logits
        Parameters:
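With this hunk, the no-labels path always wraps its outputs in a CausalLMOutput when return_dict is set, and logits becomes a list (next-step logits first, then one entry per future-prediction head) whenever output_logits is requested. A minimal usage sketch of that contract follows; the checkpoint placeholder, the input shape (taken from the generate() docstring further down), and passing the input positionally are assumptions rather than anything this diff specifies.

import torch
from modeling_auristream import AuriStream  # class defined in this file

model = AuriStream.from_pretrained("<path-or-repo-id>").eval()  # hypothetical checkpoint location
seq = torch.randn(1, 200, 64)  # (b, t, n_freq_bins); concrete sizes are illustrative

with torch.no_grad():
    out = model(seq, output_logits=True, output_hidden_states=True, return_dict=True)

next_step_logits = out.logits[0]   # first entry: next-step prediction head
future_logits = out.logits[1:]     # remaining entries: one per future head
hidden_states = out.hidden_states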
@@ -252,7 +286,7 @@ class AuriStream(PreTrainedModel):

    @torch.no_grad()
    def generate(self, seq: torch.Tensor, n_tokens: int = 1, temp=1.0,
-                 top_k=
+                 top_k=None, top_p=None, seed=None):
        """
        Parameters:
            seq: torch.Tensor of shape (b, t, n_freq_bins)
@@ -321,7 +355,7 @@ class AuriStream(PreTrainedModel):

        # First prediction of the model is the decoding of the last time bin
        logits = self.coch_head(x[:, [-1]])
-        predictions = [self.sample_logits(logits, temperature=temp)]
+        predictions = [self.sample_logits(logits, temperature=temp, top_k=top_k, top_p=top_p)]
        all_logits.append(logits)

        ### Predict future tokens
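sample_logits and generate now accept top_p alongside top_k, and generate forwards both into the sampling call above. The body of sample_logits is not part of this diff, so the sketch below is a generic temperature / top-k / nucleus (top-p) filter of the kind those parameters normally imply, written for logits shaped (b, 1, vocab) as produced by coch_head; it is not the repository's exact implementation.

import torch
import torch.nn.functional as F

def sample_logits_sketch(logits: torch.Tensor, temperature: float = 0.9,
                         top_k: int = None, top_p: float = None) -> torch.LongTensor:
    """Generic temperature / top-k / top-p sampling; illustrative only."""
    logits = logits[:, -1, :] / temperature                        # (b, vocab)

    if top_k is not None:
        # keep only the top_k highest-scoring entries
        kth_value = torch.topk(logits, top_k, dim=-1).values[:, [-1]]
        logits = logits.masked_fill(logits < kth_value, float("-inf"))

    if top_p is not None:
        # nucleus filtering: keep the smallest prefix of sorted tokens whose mass reaches top_p
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        sorted_probs = F.softmax(sorted_logits, dim=-1)
        cum_probs = torch.cumsum(sorted_probs, dim=-1)
        remove = (cum_probs - sorted_probs) > top_p                # always keeps the top token
        sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
        logits = torch.full_like(logits, float("-inf")).scatter(-1, sorted_idx, sorted_logits)

    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)                 # (b, 1) sampled ids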
@@ -534,31 +568,82 @@ class CausalSelfAttention(nn.Module):

        return y

-    def kv_cache_forward(self, x, k_cache=None, v_cache=None):
-        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+    # def kv_cache_forward(self, x, k_cache=None, v_cache=None):
+    #     B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+    #     # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+    #     q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+    #     k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+    #     q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+    #     v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

+    #     # append cached keys and values with new keys and values
+    #     if k_cache is not None:
+    #         k = torch.cat((k_cache, k), dim=2)
+    #     if v_cache is not None:
+    #         v = torch.cat((v_cache, v), dim=2)
+
+    #     if self.rotary is not None:
+    #         cos, sin = self.rotary(q)
+    #         q = apply_rotary_emb(q, cos, sin)
+    #         k = apply_rotary_emb(k, cos, sin)
+
+    #     # manual implementation of attention
+    #     att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+    #     att = F.softmax(att, dim=-1)
+    #     y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+    #     y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+    #     # output projection
+    #     y = self.c_proj(y)
+
+    #     return y, k, v
+
+    def kv_cache_forward(self, x, k_cache=None, v_cache=None):
+        B, T, C = x.size() # T=1 for single new token
+
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
-        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-
-        # append cached keys and values with new keys and values
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, 1, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, 1, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, 1, hs)
+
+        # Apply RoPE BEFORE concatenation, using correct absolute position
+        if self.rotary is not None:
+            # Determine the position of the new token
+            cache_len = k_cache.shape[2] if k_cache is not None else 0
+
+            # Create a dummy tensor with the correct sequence position for rotary computation
+            # We need shape (B, cache_len + 1, nh, hs) but only use the last position
+            dummy = torch.zeros(B, cache_len + T, self.n_head, self.head_dim,
+                                device=q.device, dtype=q.dtype)
+            cos, sin = self.rotary(dummy)
+
+            # Extract rotary embeddings for only the new token position
+            cos = cos[:, cache_len:cache_len+T, :, :]
+            sin = sin[:, cache_len:cache_len+T, :, :]
+
+            # Apply rotary embeddings to new q and k only
+            q = apply_rotary_emb(q, cos, sin)
+            k = apply_rotary_emb(k, cos, sin)
+
+        # NOW concatenate with cache (cached keys already have correct RoPE applied)
        if k_cache is not None:
            k = torch.cat((k_cache, k), dim=2)
        if v_cache is not None:
            v = torch.cat((v_cache, v), dim=2)
-
+
        # manual implementation of attention
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = F.softmax(att, dim=-1)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
-
+
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
-
+
        # output projection
        y = self.c_proj(y)
-
+
        return y, k, v

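The substance of this hunk is that rotary embeddings for the incoming token must be computed at its absolute position (cache_len) and applied to q and k before they are appended to the cache, since the cached keys were already rotated at their own positions when they were produced. The standalone sketch below, with illustrative helper names (rope_cos_sin, apply_rope, attend) rather than the repository's rotary/apply_rotary_emb, checks that invariant: a single-token step against a cache reproduces the full-sequence result.

import math
import torch
import torch.nn.functional as F

def rope_cos_sin(positions: torch.Tensor, head_dim: int):
    # cos/sin tables for the given absolute positions, each of shape (len(positions), head_dim)
    inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
    freqs = positions.float()[:, None] * inv_freq[None, :]
    emb = torch.cat((freqs, freqs), dim=-1)
    return emb.cos(), emb.sin()

def apply_rope(x, cos, sin):
    # x: (B, nh, T, hs); cos/sin: (T, hs); rotate-half formulation
    x1, x2 = x.chunk(2, dim=-1)
    rotated = torch.cat((-x2, x1), dim=-1)
    return x * cos[None, None] + rotated * sin[None, None]

def attend(q, k, v):
    att = F.softmax((q @ k.transpose(-2, -1)) / math.sqrt(k.size(-1)), dim=-1)
    return att @ v

B, nh, T, hs = 1, 2, 5, 8
q, k, v = (torch.randn(B, nh, T, hs) for _ in range(3))

# Full-sequence pass: rotate q/k at positions 0..T-1, attend from the last position.
cos, sin = rope_cos_sin(torch.arange(T), hs)
full = attend(apply_rope(q, cos, sin)[:, :, -1:], apply_rope(k, cos, sin), v)

# Incremental pass: the cache holds keys already rotated at positions 0..T-2;
# the new token's q/k are rotated at absolute position T-1 BEFORE concatenation.
cos_c, sin_c = rope_cos_sin(torch.arange(T - 1), hs)
k_cache = apply_rope(k[:, :, :-1], cos_c, sin_c)
v_cache = v[:, :, :-1]
cos_n, sin_n = rope_cos_sin(torch.arange(T - 1, T), hs)
q_new = apply_rope(q[:, :, -1:], cos_n, sin_n)
k_new = apply_rope(k[:, :, -1:], cos_n, sin_n)
step = attend(q_new, torch.cat((k_cache, k_new), dim=2), torch.cat((v_cache, v[:, :, -1:]), dim=2))

print(torch.allclose(full, step, atol=1e-6))  # True

The dummy tensor in the new kv_cache_forward serves the same purpose as the explicit position index here: it makes self.rotary produce a table long enough that positions cache_len:cache_len+T can be sliced out for the new token.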