Update modeling_auristream.py

modeling_auristream.py  (+64 -43)

@@ -111,30 +111,35 @@ class AuriStream(PreTrainedModel):
     x = self.transformer.ln_f(x)
     logits = self.coch_head(x)
 
-    if tgt is not None:
-        …
 
+    if output_logits:
+        all_logits = [logits]
+
+    if tgt is not None:
         loss = F.cross_entropy(
             logits.reshape(-1, self.config.vocab_size), tgt.reshape(-1),
         )
 
-        …
+    # If we have more than one future head, compute the loss for each head
+    if self.future_heads is not None:
+        for i, head in enumerate(self.future_heads):
+            future_logits = head(x[:, :-(i+1)])
+
+            if tgt is not None:
                 loss += F.cross_entropy(
                     future_logits.reshape(-1, self.config.vocab_size), tgt[:, (i+1):].reshape(-1),
                 )
-        …
+            if output_logits:
+                all_logits.append(future_logits)
+
+    if tgt is not None:
         # divide loss by number of future heads
         loss = loss / (len(self.future_heads) + 1)
 
-    …
+    if return_dict:
+        if output_logits:
+            if output_hidden_states:
+                if tgt is not None:
                     model_output = CausalLMOutput(
                         loss=loss,
                         logits=all_logits,
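A note on the alignment in the future-head loss added above: head i is trained to predict the token (i+1) steps ahead, so its logits keep only the positions that have a target that far in the future (x[:, :-(i+1)]) and are scored against targets shifted by the same offset (tgt[:, (i+1):]). A minimal shape check of that pairing, with stand-in linear heads rather than the model's actual future_heads:

```python
import torch
import torch.nn.functional as F

# Toy check of the offset pairing used in the hunk above; the heads here
# are stand-ins, not AuriStream's actual future_heads.
B, T, D, V = 2, 10, 8, 16
x = torch.randn(B, T, D)            # final hidden states after ln_f
tgt = torch.randint(0, V, (B, T))   # per-position next-token targets
future_heads = [torch.nn.Linear(D, V) for _ in range(3)]

for i, head in enumerate(future_heads):
    future_logits = head(x[:, :-(i + 1)])   # drop the last i+1 positions
    aligned_tgt = tgt[:, (i + 1):]          # drop the first i+1 targets
    assert future_logits.shape[:2] == aligned_tgt.shape  # both (B, T-(i+1))
    loss = F.cross_entropy(future_logits.reshape(-1, V), aligned_tgt.reshape(-1))
```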
@@ -142,23 +147,45 @@ class AuriStream(PreTrainedModel):
                     )
                 else:
                     model_output = CausalLMOutput(
-                        loss=loss,
                         logits=all_logits,
+                        hidden_states=all_hidden_states,
                     )
             else:
-                if …
+                if tgt is not None:
+                    model_output = CausalLMOutput(
+                        loss=loss,
+                        logits=all_logits,
+                    )
+                else:
+                    model_output = CausalLMOutput(
+                        logits=all_logits,
+                    )
+        else:
+            if output_hidden_states:
+                if tgt is not None:
                     model_output = CausalLMOutput(
                         loss=loss,
                         logits=logits,
                         hidden_states=all_hidden_states,
                     )
                 else:
+                    model_output = CausalLMOutput(
+                        logits=logits,
+                        hidden_states=all_hidden_states,
+                    )
+            else:
+                if tgt is not None:
                     model_output = CausalLMOutput(
                         loss=loss,
                         logits=logits,
                     )
-    …
+                else:
+                    model_output = CausalLMOutput(
+                        logits=logits,
+                    )
+        return model_output
+
+    if tgt is not None:
         return logits, loss
 
     return logits, None
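The nesting above enumerates every combination of output_logits, output_hidden_states, and tgt. Since CausalLMOutput (from transformers.modeling_outputs) defaults each of its fields to None, an equivalent, flatter construction is possible; a sketch of that alternative, not the committed code:

```python
from transformers.modeling_outputs import CausalLMOutput

# Sketch only (not this commit's code): the nested branches collapse to one
# constructor call because CausalLMOutput fields default to None.
def build_output(logits, loss=None, all_logits=None, all_hidden_states=None,
                 output_logits=False, output_hidden_states=False) -> CausalLMOutput:
    return CausalLMOutput(
        loss=loss,  # None when no targets were given
        logits=all_logits if output_logits else logits,
        hidden_states=all_hidden_states if output_hidden_states else None,
    )
```

Whether that trade reads better than the explicit branches is a style call; the explicit version makes every legal combination visible at the cost of repetition.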
@@ -215,26 +242,35 @@ class AuriStream(PreTrainedModel):
         return sampled
 
     @torch.no_grad()
-    def generate(
-        …
+    def generate(
+        self,
+        seq: torch.Tensor,
+        n_tokens: int = 1,
+        temp: float = 1.0,
+        top_k: int = None,
+        top_p: float = None,
+        seed: int = None,
+    ):
         """
         Parameters:
-        seq: torch.Tensor of shape (b, t…
-            Input …
+        seq: torch.Tensor of shape (b, t)
+            Input cochlear tokens to condition the generation
         n_tokens: int
-            Number of time bins to predict
+            Number of future tokens (5ms time bins) to predict
         temp: float
             Temperature for sampling logits
+        top_k: int
+            Restrict sampling to the k tokens with highest probability (sample from all tokens if None)
+        top_p: float
+            Restrict sampling to the most probable tokens with cumulative probability p (sample from all tokens if None)
         seed: int
             Random seed for sampling
 
         Returns:
-        pred_coch: torch.Tensor of shape (b, t…
-            The …
-        all_logits: …
-            The logits …
-        all_embs: (optional if return_embs is not None) list of torch.Tensor
-            The embeddings for each transformer block
+        pred_coch: torch.Tensor of shape (b, t)
+            The generated cochlear tokens
+        all_logits: torch.Tensor of shape (b, n_tokens, vocab_size)
+            The logits at each time step
         """
 
         # Set seed if provided
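top_k and top_p are documented here, but the filtering step itself is not visible in these hunks. For reference, a common way to implement both, as a standalone sketch (the helper name is ours, not necessarily the repo's):

```python
import torch

def filter_logits(logits: torch.Tensor, top_k: int = None, top_p: float = None) -> torch.Tensor:
    # Standard top-k / nucleus filtering sketch; AuriStream's actual sampling
    # code is not shown in this diff, so treat this as illustrative only.
    if top_k is not None:
        kth_best = torch.topk(logits, top_k, dim=-1).values[..., -1, None]
        logits = logits.masked_fill(logits < kth_best, float("-inf"))
    if top_p is not None:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        probs = torch.softmax(sorted_logits, dim=-1)
        cum_probs = probs.cumsum(dim=-1)
        # drop tokens whose preceding cumulative probability already exceeds p
        drop = (cum_probs - probs) > top_p
        sorted_logits = sorted_logits.masked_fill(drop, float("-inf"))
        logits = sorted_logits.gather(-1, sorted_idx.argsort(dim=-1))
    return logits
```

Sampling then proceeds as usual, e.g. torch.multinomial(torch.softmax(filtered / temp, dim=-1), 1).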
@@ -250,14 +286,6 @@ class AuriStream(PreTrainedModel):
         # grab shape of the cochleagram
         b, t = seq.size()
 
-        # TODO: double check this works then delete the block bellow:
-        # pass the given input through the model to get the predictions and cache
-        # the k and v values for each transformer block in the process
-        # pos = torch.arange(0, t, dtype=torch.long, device=device)
-        # tok_emb = self.transformer.wte(seq) # token embeddings of shape (b, t, n_embd)
-        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        # x = self.transformer.drop(tok_emb + pos_emb)
-
         #### Embed conditioning sequence into KV cache
 
         tok_emb = self.transformer.wte(seq) # token embeddings of shape (b, t, n_embd)
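With the stale block removed, conditioning is a single KV-cache prefill over seq, followed by one cached step per new token. End to end, the updated API would be called roughly as below (hypothetical usage; the checkpoint path and input tokens are placeholders, and the two return values are assumed from the docstring above):

```python
import torch

# Hypothetical usage of the new generate() signature; the checkpoint path
# and the conditioning tokens are placeholders, not from this commit.
model = AuriStream.from_pretrained("path/to/auristream-checkpoint")
seq = torch.randint(0, model.config.vocab_size, (1, 200))  # (b, t) cochlear tokens

pred_coch, all_logits = model.generate(
    seq,
    n_tokens=100,   # predict 100 future 5ms bins
    temp=0.9,
    top_k=50,
    seed=0,
)
```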
@@ -295,13 +323,6 @@ class AuriStream(PreTrainedModel):
         # using the last embedding of the input
         for i in range(n_tokens-1):
 
-            # TODO: double check this works then delete the block bellow:
-            # # Get the emb and pos embedding of just the last token
-            # pos = torch.arange(t+i, t+i+1, dtype=torch.long, device=device) # shape (t)
-            # tok_emb = self.transformer.wte(predictions[-1]) # token embeddings of shape (b, t, n_embd)
-            # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-            # x = self.transformer.drop(tok_emb + pos_emb)
-
             # Get the emb and pos embedding of just the last token
             tok_emb = self.transformer.wte(predictions[-1]) # token embeddings of shape (b, t, n_embd)
             # if wpe exists in self.transformer, apply learned positional embedding
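Inside this loop, only the most recent token is embedded and run through the cached transformer, and the next token is drawn from the temperature-scaled distribution. A self-contained illustration of that per-step draw, including how the seed argument can be honored (the helper name is ours, and the model forward is faked with random logits):

```python
import torch

def sample_next(logits: torch.Tensor, temp: float = 1.0,
                generator: torch.Generator = None) -> torch.Tensor:
    # One sampling step: temperature-scale, softmax, draw one token per batch row.
    probs = torch.softmax(logits / temp, dim=-1)
    return torch.multinomial(probs, num_samples=1, generator=generator)  # (b, 1)

gen = torch.Generator().manual_seed(0)   # plays the role of the `seed` argument
logits = torch.randn(2, 128)             # stand-in for the coch_head output
next_token = sample_next(logits, temp=0.9, generator=gen)
```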
|