Upload AuriStream Parallel base model code
modeling_auristream_parallel.py
@@ -198,6 +198,10 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         bsz, tg, gsz, vsz = logits.shape
         return logits.reshape(bsz, tg * gsz, vsz)
 
+    def _expand_group_hidden(self, x: torch.Tensor, target_len: int) -> torch.Tensor:
+        expanded = x.repeat_interleave(self.group_size, dim=1)
+        return expanded[:, :target_len, :]
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
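For readers skimming the hunk above, here is a minimal shape sketch of what the new _expand_group_hidden helper and the final reshape in _decode_parallel_logits do. It is illustrative only and not part of the repo: the sizes are made up, and the real model takes group_size from its config.

import torch

# Hypothetical sizes for illustration: 2 sequences, 3 groups of 4 tokens,
# hidden width 8, vocab 16.
batch, t_groups, group_size, hidden, vocab = 2, 3, 4, 8, 16

# Group-level hidden states: one vector per group of group_size tokens.
x = torch.randn(batch, t_groups, hidden)

# _expand_group_hidden (sketch): repeat each group vector group_size times
# along the time axis, then trim to the usable token length.
target_len = 10
expanded = x.repeat_interleave(group_size, dim=1)[:, :target_len, :]
print(expanded.shape)  # torch.Size([2, 10, 8]) -- one vector per token

# The final reshape in _decode_parallel_logits (sketch): merge the group axis
# and the within-group axis back into a single token axis.
logits = torch.randn(batch, t_groups, group_size, vocab)
print(logits.reshape(batch, t_groups * group_size, vocab).shape)  # torch.Size([2, 12, 16])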
@@ -214,16 +218,28 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         if input_ids is None:
             raise ValueError("input_ids (or seq) must be provided")
 
+        usable_len = (input_ids.shape[1] // self.group_size) * self.group_size
+        if usable_len <= 0:
+            raise ValueError(
+                f"Input sequence length {input_ids.shape[1]} is too short for group_size={self.group_size}"
+            )
+        if usable_len != input_ids.shape[1]:
+            input_ids = input_ids[:, :usable_len]
+            if labels is not None:
+                labels = labels[:, :usable_len]
+
         x = self._group_embed(input_ids)
 
         all_hidden_states = ()
         if output_hidden_states:
-            all_hidden_states = (x,)
+            all_hidden_states = (self._expand_group_hidden(x, target_len=usable_len),)
 
         for block in self.h:
             x = block(x)
             if output_hidden_states:
-                all_hidden_states = all_hidden_states + (x,)
+                all_hidden_states = all_hidden_states + (
+                    self._expand_group_hidden(x, target_len=usable_len),
+                )
 
         x = self.ln_f(x)
         logits = self._decode_parallel_logits(x)
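The truncation added at the top of forward keeps only the largest prefix whose length is a multiple of group_size, so the sequence tiles evenly into groups. A standalone sketch of the arithmetic follows; the helper name is made up for illustration and is not part of the model API.

def usable_length(seq_len: int, group_size: int) -> int:
    # Mirrors: usable_len = (input_ids.shape[1] // group_size) * group_size
    return (seq_len // group_size) * group_size

assert usable_length(103, 4) == 100  # 3 trailing tokens are dropped
assert usable_length(100, 4) == 100  # already a multiple: nothing is cut
assert usable_length(3, 4) == 0      # shorter than one group -> forward raises ValueError

Because labels are truncated to the same usable_len and every entry of all_hidden_states is expanded back to usable_len positions, hidden states, logits, and labels all share one token axis after this change.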