mazesmazes
/

tiny-audio

@@ -840,6 +840,22 @@ class ASRModel(PreTrainedModel):
         print(f"DEBUG generate_stream: num_audio_tokens={num_audio_tokens}", file=sys.stderr)
         print(f"DEBUG generate_stream: generate_kwargs={generate_kwargs}", file=sys.stderr)
         # Test: Try without threading first to see if that's the issue
         print(f"DEBUG: Testing non-threaded generation first", file=sys.stderr)
         test_output = self.decoder.generate(

         print(f"DEBUG generate_stream: num_audio_tokens={num_audio_tokens}", file=sys.stderr)
         print(f"DEBUG generate_stream: generate_kwargs={generate_kwargs}", file=sys.stderr)
+        # Debug: Check devices and values
+        print(f"DEBUG: inputs_embeds device={inputs_embeds.device}", file=sys.stderr)
+        print(f"DEBUG: expanded_prompt_ids device={expanded_prompt_ids.device}", file=sys.stderr)
+        print(f"DEBUG: attention_mask device={attention_mask.device}", file=sys.stderr)
+        print(f"DEBUG: decoder device={next(self.decoder.parameters()).device}", file=sys.stderr)
+        # Check if audio embeddings are non-zero
+        audio_mask = (expanded_prompt_ids == self.audio_token_id)
+        print(f"DEBUG: audio_mask sum={audio_mask.sum().item()} (should be {num_audio_tokens})", file=sys.stderr)
+        # Check a sample of the embeddings where audio should be
+        audio_positions = torch.where(audio_mask[0])[0]
+        if len(audio_positions) > 0:
+            sample_pos = audio_positions[0].item()
+            print(f"DEBUG: Sample audio embed at pos {sample_pos}: mean={inputs_embeds[0, sample_pos].mean().item():.4f}, std={inputs_embeds[0, sample_pos].std().item():.4f}", file=sys.stderr)
         # Test: Try without threading first to see if that's the issue
         print(f"DEBUG: Testing non-threaded generation first", file=sys.stderr)
         test_output = self.decoder.generate(