Update custom model files, README, and requirements
asr_modeling.py  +10 -12  CHANGED
@@ -733,30 +733,28 @@ class ASRModel(PreTrainedModel):
         """
         Stream generation by using the working generate() method with a TextIteratorStreamer.
         """
-        # Set up the streamer
-        #
-        #
+        # Set up the streamer - use skip_prompt=True like Ultravox
+        # The key is that when we return the full sequence from generate(),
+        # the streamer can properly identify and skip the prompt
         streamer = TextIteratorStreamer(
             self.tokenizer,
-            skip_prompt=True,
+            skip_prompt=True,  # Skip the prompt tokens
             skip_special_tokens=True,
-            timeout=30.0
+            timeout=30.0
         )
 
-        # Count prompt length for stats
-        # We need to encode just to get the prompt length
         audio_inputs = input_values if input_values is not None else input_features
         if audio_inputs is None:
             raise ValueError("input_values or input_features must be provided")
 
-        # Simple way to get prompt length - just count audio tokens
         import threading
         from concurrent import futures
 
         # Run generation in a thread with streamer
         def generation_thread(future: futures.Future):
             try:
-                #
+                # Call generate with the streamer
+                # Important: This now returns the FULL sequence when streaming
                 result = self.generate(
                     input_values=input_values,
                     input_features=input_features,
@@ -774,17 +772,17 @@ class ASRModel(PreTrainedModel):
         thread = threading.Thread(target=generation_thread, args=(future,))
         thread.start()
 
-        # Stream the output
+        # Stream the output - like Ultravox, just yield chunks as they come
         output_token_count = 0
         try:
             for chunk in streamer:
-                if chunk:
+                if chunk:  # Only yield non-empty chunks
                     output_token_count += 1
                     yield StreamChunk(chunk)
         except Exception as e:
             # Check if it's the Empty exception from queue
             if e.__class__.__name__ == "Empty":
-                # This
+                # This happens when generation completes before we start iterating
                 pass
             else:
                 # Re-raise other exceptions
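For reference, the threading pattern this diff refines is the standard transformers TextIteratorStreamer recipe: run generate() on a worker thread with the streamer attached, and iterate the streamer on the calling thread as decoded text chunks arrive. Below is a minimal sketch of that pattern only; the "gpt2" checkpoint and the text prompt are placeholders, and it does not reproduce this repo's StreamChunk wrapper or audio inputs.

    import threading

    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    # Placeholder checkpoint; ASRModel wraps its own decoder and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    inputs = tokenizer("Transcription:", return_tensors="pt")

    # skip_prompt=True drops the prompt tokens from the streamed text;
    # timeout bounds how long the consumer waits on the streamer's internal queue.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        timeout=30.0,
    )

    # Producer: generate() runs on a worker thread and pushes decoded chunks
    # into the streamer as tokens are produced.
    thread = threading.Thread(
        target=model.generate,
        kwargs={**inputs, "max_new_tokens": 64, "streamer": streamer},
    )
    thread.start()

    # Consumer: iterate on the calling thread; each item is a decoded text chunk.
    for chunk in streamer:
        if chunk:  # skip empty flushes
            print(chunk, end="", flush=True)
    thread.join()

The generate_stream() in asr_modeling.py follows the same mechanics but feeds input_values/input_features instead of a text prompt, hands the final generate() result back through a concurrent.futures.Future, and yields each chunk wrapped in StreamChunk.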