Update modeling_auristream.py

modeling_auristream.py  CHANGED  (+64 -29)
@@ -72,73 +72,108 @@ class AuriStream(PreTrainedModel):
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

-    def forward(self, seq, tgt=None, output_logits=False, output_hidden_states=False, return_dict=False, up_until_layer=None):
+    def forward(
+        self,
+        seq,
+        tgt=None,
+        output_logits=False,
+        output_hidden_states=False,
+        return_dict=False,
+        up_until_layer=None,
+        normalize_embeddings=None,
+    ):
         """
-        Input:
-
+        Input: seq: torch.Tensor of shape (b, t)
+               tgt: torch.Tensor of shape (b, t) or None
+
+        Behavior (unchanged unless normalize_embeddings is set and output_hidden_states=True):
+        - When normalize_embeddings is None: identical to prior behavior.
+        - When normalize_embeddings in {'l2','learned'} and output_hidden_states=True:
+          the list returned in `hidden_states` is normalized per request.
+          (logits/loss computation remains unchanged.)
         """
-
+
         # forward the GPT model itself
-        tok_emb = self.transformer.wte(seq)
-
-        #
+        tok_emb = self.transformer.wte(seq)  # (b, t, n_embd)
+
+        # learned positional embeddings if present
         if hasattr(self.transformer, 'wpe'):
             pos = torch.arange(0, seq.size(1), dtype=torch.long, device=seq.device)
-            pos_emb = self.transformer.wpe(pos)
+            pos_emb = self.transformer.wpe(pos)  # (t, n_embd)
             x = self.transformer.drop(tok_emb + pos_emb)
         else:
             x = self.transformer.drop(tok_emb)
-
+
         all_hidden_states = []
         for block_idx, block in enumerate(self.transformer.h):
-            #
+            # capture pre-block hidden state
             all_hidden_states.append(x)
             if up_until_layer is not None and block_idx == up_until_layer:
                 break
             x = block(x)
-
-        # append
+
+        # append final pre-ln_f state if we did not exit early
         if up_until_layer is None or block_idx == len(self.transformer.h) - 1:
             all_hidden_states.append(x)
-
+
+        # optional normalization of hidden states for returning
+        hs_to_return = all_hidden_states
+        if output_hidden_states and normalize_embeddings is not None:
+            if normalize_embeddings == 'l2':
+                hs_to_return = [F.normalize(h, p=2, dim=-1) for h in all_hidden_states]
+            elif normalize_embeddings == 'learned':
+                hs_to_return = []
+                L = len(self.transformer.h)
+                for i, h in enumerate(all_hidden_states):
+                    if i < L:
+                        # input emb -> block0.norm1, block0 out -> block1.norm1, ...
+                        hs_to_return.append(self.transformer.h[i].norm1(h))
+                    else:
+                        # final layer -> transformer.ln_f
+                        hs_to_return.append(self.transformer.ln_f(h))
+            else:
+                # any other value behaves like None (no normalization)
+                hs_to_return = all_hidden_states
+
+        # if only hidden states are requested (and not logits), return here
         if output_hidden_states and not output_logits:
             model_output = BaseModelOutput(
-                last_hidden_state=x,
-                hidden_states=all_hidden_states,
+                last_hidden_state=x,  # unchanged (pre-ln_f), to preserve original behavior
+                hidden_states=hs_to_return,  # possibly normalized per the new option
             )
             return model_output
-
+
+        # standard logits path (unchanged)
         x = self.transformer.ln_f(x)
         logits = self.coch_head(x)
-
+
         if tgt is not None:
-
             if output_logits:
                 all_logits = [logits]
-
+
             loss = F.cross_entropy(
                 logits.reshape(-1, self.config.vocab_size), tgt.reshape(-1),
             )
-
-            #
+
+            # future multi-step heads (unchanged)
             if self.future_heads is not None:
                 for i, head in enumerate(self.future_heads):
-                    future_logits = head(x[:, :-(i+1)])
+                    future_logits = head(x[:, :-(i + 1)])
                     loss += F.cross_entropy(
-                        future_logits.reshape(-1, self.config.vocab_size), tgt[:, (i+1):].reshape(-1),
+                        future_logits.reshape(-1, self.config.vocab_size),
+                        tgt[:, (i + 1):].reshape(-1),
                     )
                     if output_logits:
                         all_logits.append(future_logits)
-                # divide loss by number of future heads
                 loss = loss / (len(self.future_heads) + 1)
-
+
             if return_dict:
                 if output_logits:
                     if output_hidden_states:
                         model_output = CausalLMOutput(
                             loss=loss,
                             logits=all_logits,
-                            hidden_states=all_hidden_states,
+                            hidden_states=hs_to_return,
                         )
                     else:
                         model_output = CausalLMOutput(
@@ -150,7 +185,7 @@ class AuriStream(PreTrainedModel):
                     model_output = CausalLMOutput(
                         loss=loss,
                         logits=logits,
-                        hidden_states=all_hidden_states,
+                        hidden_states=hs_to_return,
                     )
                 else:
                     model_output = CausalLMOutput(
@@ -158,9 +193,9 @@ class AuriStream(PreTrainedModel):
                         logits=logits,
                     )
             return model_output
-
+
         return logits, loss
-
+
         return logits, None

     def sample_logits(self, logits: torch.FloatTensor, temperature: float = 0.9,
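For reference, a minimal usage sketch of the new `normalize_embeddings` option. This is not part of the diff: it assumes an AuriStream checkpoint already instantiated as `model`, and the batch/sequence sizes are arbitrary.

    import torch

    seq = torch.randint(0, model.config.vocab_size, (2, 128))  # (b, t) token ids

    # Prior behavior: raw hidden states (embedding output plus one entry per block).
    out = model(seq, output_hidden_states=True)

    # 'l2': per-layer unit-norm copies of the same states; logits/loss are untouched.
    out_l2 = model(seq, output_hidden_states=True, normalize_embeddings='l2')
    norms = out_l2.hidden_states[-1].norm(dim=-1)  # ~1.0 everywhere

    # 'learned': each state passed through the LayerNorm that would consume it
    # (block i's norm1 for state i, transformer.ln_f for the final state).
    out_ln = model(seq, output_hidden_states=True, normalize_embeddings='learned')

In all three calls `last_hidden_state` is the same raw pre-ln_f tensor, so existing callers are unaffected.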
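The future-head slicing is worth a sanity check: `head(x[:, :-(i + 1)])` drops the last i + 1 positions, and `tgt[:, (i + 1):]` drops the first i + 1, so logits at position j are scored against the target i + 1 steps later. A standalone toy version of that alignment (all names and sizes here are made up for illustration):

    import torch
    import torch.nn.functional as F

    b, t, n_embd, vocab_size = 2, 10, 8, 5
    x = torch.randn(b, t, n_embd)               # final hidden states
    tgt = torch.randint(0, vocab_size, (b, t))  # per-position targets
    head = torch.nn.Linear(n_embd, vocab_size)  # stand-in for future_heads[i]

    i = 0
    future_logits = head(x[:, :-(i + 1)])       # (b, t - 1, vocab_size)
    labels = tgt[:, (i + 1):]                   # (b, t - 1), shifted to match
    loss = F.cross_entropy(future_logits.reshape(-1, vocab_size),
                           labels.reshape(-1))

Both slices have length t - (i + 1), so the two reshapes stay aligned for every head index i.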
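`up_until_layer` composes with the new option, since normalization is applied to whatever was collected before the early exit. A probing sketch, reusing `model` and `seq` from above (the layer index is illustrative and assumes the model has at least four blocks):

    # hidden_states[3] is the state entering block 3 (output of block 2),
    # here passed through transformer.h[3].norm1 by the 'learned' option.
    out = model(seq, output_hidden_states=True, up_until_layer=3,
                normalize_embeddings='learned')
    feats = out.hidden_states[3]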