moonshotai
/

Kimi-Audio-7B-Instruct

@@ -685,14 +685,13 @@ class MoonshotKimiaModel(Qwen2PreTrainedModel):
                     .to(torch.cuda.current_device())
                     .to(whisper_dtype)
                 )
-                for (seg_idx, start_idx), (_, end_idx) in zip(
                     media_start_idx, media_end_idx
-                ):
-                    # assert whisper_emb.shape[1] == end_idx - (start_idx + 1)
                     feat_len = end_idx - (start_idx + 1)
                     whisper_input_feature_i = whisper_input_feature[seg_idx].squeeze(0)
-                    assert feat_len == is_continuous_mask[seg_idx].sum()
                     expanded_whisper[start_idx + 1 : end_idx, :] = (
                         whisper_input_feature_i[:feat_len, :]
                     )

                     .to(torch.cuda.current_device())
                     .to(whisper_dtype)
                 )
+                assert (media_end_idx - media_start_idx).sum() - media_start_idx.shape[0] == is_continuous_mask.sum()
+                for seg_idx, ((batch_idx, start_idx), (_, end_idx)) in enumerate(zip(
                     media_start_idx, media_end_idx
+                )):
                     feat_len = end_idx - (start_idx + 1)
                     whisper_input_feature_i = whisper_input_feature[seg_idx].squeeze(0)
                     expanded_whisper[start_idx + 1 : end_idx, :] = (
                         whisper_input_feature_i[:feat_len, :]
                     )