TuKoResearch
/

AuriStreamParallel-base

@@ -198,10 +198,6 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         bsz, tg, gsz, vsz = logits.shape
         return logits.reshape(bsz, tg * gsz, vsz)
-    def _expand_group_hidden(self, x: torch.Tensor, target_len: int) -> torch.Tensor:
-        expanded = x.repeat_interleave(self.group_size, dim=1)
-        return expanded[:, :target_len, :]
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -232,14 +228,12 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         all_hidden_states = ()
         if output_hidden_states:
-            all_hidden_states = (self._expand_group_hidden(x, target_len=usable_len),)
         for block in self.h:
             x = block(x)
             if output_hidden_states:
-                all_hidden_states = all_hidden_states + (
-                    self._expand_group_hidden(x, target_len=usable_len),
-                )
         x = self.ln_f(x)
         logits = self._decode_parallel_logits(x)

         bsz, tg, gsz, vsz = logits.shape
         return logits.reshape(bsz, tg * gsz, vsz)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         all_hidden_states = ()
         if output_hidden_states:
+            all_hidden_states = (x,)
         for block in self.h:
             x = block(x)
             if output_hidden_states:
+                all_hidden_states = all_hidden_states + (x,)
         x = self.ln_f(x)
         logits = self._decode_parallel_logits(x)