TuKoResearch
/

AuriStream-base

@@ -290,10 +290,12 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         self,
         input_ids: Optional[torch.LongTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = False,
         return_dict: Optional[bool] = True,
         # Legacy arguments for compatibility
-        return_logits: Optional[bool] = True,
         seq: Optional[torch.LongTensor] = None,
         tgt: Optional[torch.LongTensor] = None,
     ):
@@ -303,13 +305,16 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         Args:
             input_ids: Input token IDs of shape (batch_size, seq_len)
             labels: Target token IDs for computing loss
             output_hidden_states: Whether to return all hidden states
             return_dict: Whether to return a dict or tuple
             seq: Legacy argument (alias for input_ids)
             tgt: Legacy argument (alias for labels)
         Returns:
-            CausalLMOutput with logits and optional loss
         """
         # Handle legacy arguments
         if seq is not None:
@@ -321,22 +326,55 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         tok_emb = self.wte(input_ids)
         x = self.drop(tok_emb)
-        # Collect hidden states if requested
         all_hidden_states = []
         # Forward through transformer blocks
-        for block in self.h:
-            if output_hidden_states:
-                all_hidden_states.append(x)
             x = block(x)
-        if output_hidden_states:
             all_hidden_states.append(x)
         # Final layer norm and output head
         x = self.ln_f(x)
         logits = self.lm_head(x)
         # Compute loss if labels provided
         loss = None
         if labels is not None:
@@ -348,21 +386,21 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
             # Multi-token prediction loss
             if self.future_heads is not None:
                 for i, head in enumerate(self.future_heads):
-                    future_logits = head(x[:, :-(i+1)])
                     loss = loss + F.cross_entropy(
                         future_logits.reshape(-1, self.config.vocab_size),
-                        labels[:, (i+1):].reshape(-1),
                     )
         if not return_dict:
             if labels is not None:
-                return logits, loss
-            return logits.unsqueeze(0), None
         return CausalLMOutput(
             loss=loss,
-            logits=logits.unsqueeze(0),
-            hidden_states=all_hidden_states if output_hidden_states else None,
         )
     def sample_logits(

         self,
         input_ids: Optional[torch.LongTensor] = None,
         labels: Optional[torch.LongTensor] = None,
+        output_logits: Optional[bool] = False,
         output_hidden_states: Optional[bool] = False,
         return_dict: Optional[bool] = True,
+        up_until_layer: Optional[int] = None,
+        normalize_embeddings: Optional[str] = None,
         # Legacy arguments for compatibility
         seq: Optional[torch.LongTensor] = None,
         tgt: Optional[torch.LongTensor] = None,
     ):
         Args:
             input_ids: Input token IDs of shape (batch_size, seq_len)
             labels: Target token IDs for computing loss
+            output_logits: Whether to return all logits (including from future heads)
             output_hidden_states: Whether to return all hidden states
             return_dict: Whether to return a dict or tuple
+            up_until_layer: If set, stop before applying the block at this index, so the
+                last hidden state collected is that block's input
+            normalize_embeddings: 'l2' or 'learned' to normalize hidden states
             seq: Legacy argument (alias for input_ids)
             tgt: Legacy argument (alias for labels)
         Returns:
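+            CausalLMOutput with logits and optional loss, or a (logits, loss) tuple when
+            return_dict is False; BaseModelOutput when only hidden states are requested
+            (output_hidden_states set, output_logits unset, no labels)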
         """
         # Handle legacy arguments
         if seq is not None:
         tok_emb = self.wte(input_ids)
         x = self.drop(tok_emb)
+        # Collect hidden states
         all_hidden_states = []
         # Forward through transformer blocks
+        for block_idx, block in enumerate(self.h):
+            all_hidden_states.append(x)
+            if up_until_layer is not None and block_idx == up_until_layer:
+                break
             x = block(x)
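+        # Append final pre-ln_f state when the loop reached the last block index.
+        # NOTE(review): if up_until_layer == len(self.h) - 1 the loop still exits early,
+        # but this condition holds too, so the last block's input is appended twice.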
+        if up_until_layer is None or block_idx == len(self.h) - 1:
             all_hidden_states.append(x)
+        # Normalize hidden states if requested
+        hs_to_return = all_hidden_states
+        if output_hidden_states and normalize_embeddings is not None:
+            if normalize_embeddings == 'l2':
+                hs_to_return = [F.normalize(h, p=2, dim=-1) for h in all_hidden_states]
+            elif normalize_embeddings == 'learned':
+                hs_to_return = []
+                L = len(self.h)
+                for i, h in enumerate(all_hidden_states):
+                    if i < L:
+                        hs_to_return.append(self.h[i].norm1(h))
+                    else:
+                        hs_to_return.append(self.ln_f(h))
+        # If only hidden states requested (no logits, no labels), return early.
+        # NOTE(review): x is still pre-ln_f here and return_dict is not consulted.
+        if output_hidden_states and not output_logits and labels is None:
+            return BaseModelOutput(
+                last_hidden_state=x,
+                hidden_states=hs_to_return,
+            )
         # Final layer norm and output head
         x = self.ln_f(x)
         logits = self.lm_head(x)
+        # Collect all logits if requested
+        all_logits = [logits] if output_logits else None
+        # Compute future head logits (kept only when output_logits is set; the loss
+        # branch below recomputes them per head)
+        if self.future_heads is not None:
+            for i, head in enumerate(self.future_heads):
+                future_logits = head(x[:, :-(i + 1)])
+                if output_logits:
+                    all_logits.append(future_logits)
         # Compute loss if labels provided
         loss = None
         if labels is not None:
             # Multi-token prediction loss
             if self.future_heads is not None:
                 for i, head in enumerate(self.future_heads):
+                    future_logits = head(x[:, :-(i + 1)])
                     loss = loss + F.cross_entropy(
                         future_logits.reshape(-1, self.config.vocab_size),
+                        labels[:, (i + 1):].reshape(-1),
                     )
         if not return_dict:
             if labels is not None:
+                return (all_logits if output_logits else logits), loss
+            return (all_logits if output_logits else logits), None
         return CausalLMOutput(
             loss=loss,
+            logits=all_logits if output_logits else logits,
+            hidden_states=hs_to_return if output_hidden_states else None,
         )
     def sample_logits(