Update custom model files, README, and requirements
asr_modeling.py  CHANGED  (+12 -39)
@@ -698,27 +698,18 @@ class ASRModel(PreTrainedModel):
         generate_kwargs.setdefault("pad_token_id", self.tokenizer.pad_token_id)
         prompt_length = expanded_prompt_ids.shape[1]

-        # Generate
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            generated_ids = self.decoder.generate(
-                input_ids=expanded_prompt_ids,
-                inputs_embeds=inputs_embeds,
-                attention_mask=attention_mask,
-                **generate_kwargs,
-            )
-            # When not streaming, return only the new tokens (without prompt)
-            return generated_ids[:, prompt_length:]
+        # Generate (always returns full sequence, caller handles trimming)
+        generated_ids = self.decoder.generate(
+            input_ids=expanded_prompt_ids,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            streamer=streamer,
+            **generate_kwargs,
+        )
+
+        # Always return only the new tokens (without prompt)
+        # The streamer already got the full sequence during generation
+        return generated_ids[:, prompt_length:]

     @torch.no_grad()
     def generate_stream(
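The rewritten hunk collapses the old if/else on streaming into one call. That works because a Hugging Face-style generate() pushes tokens into a passed streamer during decoding yet still returns the complete sequence (prompt included), so the same prompt-trimming return serves both paths. A minimal sketch of that contract with a stock transformers model (gpt2 is just a stand-in here, not this repo's decoder):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt_ids = tokenizer("The quick brown", return_tensors="pt").input_ids
prompt_length = prompt_ids.shape[1]

# TextIteratorStreamer buffers chunks in an unbounded queue, so for a short
# generation it can be drained after generate() returns.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=prompt_ids,
        streamer=streamer,
        max_new_tokens=10,
        pad_token_id=tokenizer.eos_token_id,
    )

# generate() returned the full sequence; trim to the new tokens, as the diff does.
print(tokenizer.decode(generated_ids[0, prompt_length:], skip_special_tokens=True))

# The streamer received the same new tokens incrementally (skip_prompt=True).
print("".join(streamer))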
@@ -753,10 +744,7 @@ class ASRModel(PreTrainedModel):
         # Run generation in a thread with streamer
         def generation_thread(future: futures.Future):
             try:
-                import sys
-                print("DEBUG: Starting generation thread", file=sys.stderr)
                 # Call generate with the streamer
-                # Important: This now returns the FULL sequence when streaming
                 result = self.generate(
                     input_values=input_values,
                     input_features=input_features,
@@ -766,24 +754,18 @@ class ASRModel(PreTrainedModel):
                     streamer=streamer,
                     **generate_kwargs,
                 )
-                print("DEBUG: Generation complete", file=sys.stderr)
                 future.set_result(result)
             except Exception as e:
-                print(f"DEBUG: Generation error: {e}", file=sys.stderr)
                 future.set_exception(e)

         future: futures.Future = futures.Future()
         thread = threading.Thread(target=generation_thread, args=(future,))
         thread.start()
-        print("DEBUG: Thread started", file=sys.stderr)

         # Stream the output - like Ultravox, just yield chunks as they come
         output_token_count = 0
-        import sys
-        print("DEBUG: Starting streaming iteration", file=sys.stderr)
         try:
             for chunk in streamer:
-                print(f"DEBUG: Got chunk: {repr(chunk)}", file=sys.stderr)
                 if chunk:  # Only yield non-empty chunks
                     output_token_count += 1
                     yield StreamChunk(chunk)
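generate_stream keeps the standard thread-plus-streamer shape: generation runs in a worker thread, the caller iterates the streamer for text chunks, and a concurrent.futures.Future ferries the return value or exception back so a failure re-raises in the consumer rather than dying silently inside the thread. The bare pattern, sketched with a plain transformers model standing in for self.generate (all names here are illustrative, not the repo's):

import threading
from concurrent import futures

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
future: futures.Future = futures.Future()

def generation_thread() -> None:
    # Mirror the diff: park the result or the exception on the Future so the
    # consuming side can inspect it once the stream is drained.
    try:
        with torch.no_grad():
            result = model.generate(
                input_ids=prompt_ids,
                streamer=streamer,
                max_new_tokens=20,
                pad_token_id=tokenizer.eos_token_id,
            )
        future.set_result(result)
    except Exception as e:
        future.set_exception(e)

threading.Thread(target=generation_thread).start()

for chunk in streamer:
    if chunk:  # only forward non-empty chunks, as the diff does
        print(chunk, end="", flush=True)
print()

if future.exception():  # re-raise a generation failure in the caller's frame
    raise future.exception()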
@@ -801,15 +783,6 @@ class ASRModel(PreTrainedModel):
         if future.exception():
             raise future.exception()

-        # Debug: If no chunks were yielded, check what was generated
-        if output_token_count == 0:
-            import sys
-            result = future.result()
-            if result is not None:
-                # Note: result now includes the full sequence (including prompt)
-                # when streaming, so decode the full thing
-                decoded = self.tokenizer.decode(result[0], skip_special_tokens=True)
-                print(f"DEBUG: No chunks yielded but generated: {decoded}", file=sys.stderr)

         # For stats, estimate input tokens (we can't easily get exact count without duplicating work)
         # Rough estimate: prompt is about 20 tokens + 750 audio tokens
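All of the removed diagnostics were bare stderr prints. If that visibility is ever wanted again, a module-level logger gives the same output behind a runtime switch instead of code edits; a minimal sketch (the logger name, messages, and helper are illustrative, not part of the repo):

import logging

logger = logging.getLogger("asr_modeling")

def stream_with_logging(streamer):
    # Same introspection the deleted prints provided, but off by default;
    # enable with logging.basicConfig(level=logging.DEBUG).
    logger.debug("Starting streaming iteration")
    count = 0
    for chunk in streamer:
        logger.debug("Got chunk: %r", chunk)
        if chunk:
            count += 1
            yield chunk
    logger.debug("Streaming finished after %d chunks", count)

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    for piece in stream_with_logging(iter(["Hello", "", " world"])):
        print(piece, end="")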