mazesmazes
/

tiny-audio

@@ -392,12 +392,15 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         self,
         audio_features: torch.Tensor,
         audio_attention_mask: torch.Tensor,
     ) -> torch.Tensor:
         """Encode audio and project to LLM embedding space.
         Args:
             audio_features: Mel spectrogram features (batch, n_mels, mel_len)
             audio_attention_mask: Mask indicating real vs padded mel frames (batch, mel_len)
         Returns:
             Flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
@@ -406,24 +409,40 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state
-        # Compute per-sample encoder output lengths using conv formulas
-        encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
         # Project to LLM space
         audio_embeds = self.projector(hidden_states)
-        # Compute per-sample projector output lengths
-        projector_lengths = torch.tensor(
-            [self.projector.get_output_length(int(length.item())) for length in encoder_lengths],
-            device=audio_embeds.device,
-        )
-        # Create valid mask for variable-length samples and extract only real embeddings
-        max_len = audio_embeds.shape[1]
-        valid_mask = (
-            torch.arange(max_len, device=audio_embeds.device)[None, :] < projector_lengths[:, None]
-        )
-        return audio_embeds[valid_mask]
     def forward(
         self,
@@ -449,28 +468,16 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             if self.training and self.spec_augment is not None:
                 input_features = self.spec_augment(input_features)
             # Encode audio -> flattened (total_audio_tokens, hidden_dim)
-            audio_embeds = self._encode_audio(input_features, audio_attention_mask)
             # Replace <audio> token placeholders with audio embeddings using masked_scatter
             audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
-            num_audio_tokens = audio_token_mask.sum() // audio_token_mask.shape[-1]
-            num_audio_embeds = audio_embeds.shape[0]
-            # Handle mismatch between expected tokens and actual embeddings
-            if num_audio_embeds < num_audio_tokens:
-                # Pad audio embeddings with zeros if we have fewer than expected
-                padding = torch.zeros(
-                    num_audio_tokens - num_audio_embeds,
-                    audio_embeds.shape[-1],
-                    device=audio_embeds.device,
-                    dtype=audio_embeds.dtype,
-                )
-                audio_embeds = torch.cat([audio_embeds, padding], dim=0)
-            elif num_audio_embeds > num_audio_tokens:
-                # Truncate if we have more embeddings than tokens
-                audio_embeds = audio_embeds[:num_audio_tokens]
             inputs_embeds = inputs_embeds.masked_scatter(
                 audio_token_mask.to(inputs_embeds.device),
                 audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),

         self,
         audio_features: torch.Tensor,
         audio_attention_mask: torch.Tensor,
+        expected_token_counts: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Encode audio and project to LLM embedding space.
         Args:
             audio_features: Mel spectrogram features (batch, n_mels, mel_len)
             audio_attention_mask: Mask indicating real vs padded mel frames (batch, mel_len)
+            expected_token_counts: Expected number of audio tokens per sample from input_ids.
+                If provided, output will match these counts exactly (padding/truncating as needed).
         Returns:
             Flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
             encoder_out = self.audio_tower(input_features=audio_features)
             hidden_states = encoder_out.last_hidden_state
         # Project to LLM space
         audio_embeds = self.projector(hidden_states)
+        # Use expected token counts if provided (from input_ids), otherwise compute from audio
+        if expected_token_counts is not None:
+            token_counts = expected_token_counts
+        else:
+            # Compute per-sample encoder output lengths using conv formulas
+            encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
+            token_counts = torch.tensor(
+                [self.projector.get_output_length(int(length.item())) for length in encoder_lengths],
+                device=audio_embeds.device,
+            )
+        # Extract embeddings matching expected token counts per sample
+        batch_size = audio_embeds.shape[0]
+        hidden_dim = audio_embeds.shape[2]
+        result_embeds = []
+        for i in range(batch_size):
+            count = int(token_counts[i].item())
+            sample_embeds = audio_embeds[i, :count, :]  # Take first 'count' embeddings
+            # Pad with zeros if we don't have enough embeddings
+            if sample_embeds.shape[0] < count:
+                padding = torch.zeros(
+                    count - sample_embeds.shape[0],
+                    hidden_dim,
+                    device=audio_embeds.device,
+                    dtype=audio_embeds.dtype,
+                )
+                sample_embeds = torch.cat([sample_embeds, padding], dim=0)
+            result_embeds.append(sample_embeds)
+        return torch.cat(result_embeds, dim=0)
     def forward(
         self,
             if self.training and self.spec_augment is not None:
                 input_features = self.spec_augment(input_features)
+            # Count expected audio tokens from input_ids (ground truth from collator)
+            audio_token_counts = (input_ids == self.audio_token_id).sum(dim=-1)
             # Encode audio -> flattened (total_audio_tokens, hidden_dim)
+            audio_embeds = self._encode_audio(
+                input_features, audio_attention_mask, audio_token_counts
+            )
             # Replace <audio> token placeholders with audio embeddings using masked_scatter
             audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
             inputs_embeds = inputs_embeds.masked_scatter(
                 audio_token_mask.to(inputs_embeds.device),
                 audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),