Spaces:

pgits
/

tts-gpu-service

Sleeping

Peter Michael Gits Claude committed on Aug 22, 2025

Commit

072c9ef

1 Parent(s): befe3fe

feat: Add unmute.sh streaming text processing methodology

- Implement streaming TTS synthesis with text chunk buffering
- Add flush trick: when is_final=true, join all text chunks and synthesize them in a single pass for optimal quality
- Support both legacy single-shot and new streaming message types
- New message types: tts_streaming_synthesize, tts_streaming_response
- Progress updates during buffering with tts_streaming_progress
- Streaming capability discovery via tts_get_streaming_info
- Follows unmute.sh methodology for real-time voice applications

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +112 -40

app.py CHANGED Viewed

@@ -380,51 +380,91 @@ class WebSocketTTSHandler:
                 safe_log("error", f"Failed to send message to TTS client {client_id}: {e}")
                 await self.disconnect(client_id)
-    async def handle_text_synthesis(self, client_id: str, text: str, voice_preset: str = "v2/en_speaker_6"):
-        """Process text synthesis with real TTS service"""
         try:
-            safe_log("info", f"🔊 TTS: Processing text from {client_id}: {text[:50]}...")
-            # Use the existing ZeroGPU synthesize_speech function
-            audio_path, status = synthesize_speech(text, voice_preset)
-            if audio_path and "✅" in status:
-                # Read the generated audio file
-                with open(audio_path, 'rb') as audio_file:
-                    audio_data = audio_file.read()
-                # Encode audio as base64 for WebSocket transmission
-                audio_b64 = base64.b64encode(audio_data).decode('utf-8')
-                # Send successful synthesis
-                await self.send_message(client_id, {
-                    "type": "tts_audio_response",
-                    "audio_data": audio_b64,
-                    "audio_format": "wav",
-                    "text": text,
-                    "voice_preset": voice_preset,
-                    "timestamp": datetime.now().isoformat(),
-                    "audio_size": len(audio_data),
-                    "status": status
-                })
-                safe_log("info", f"🔊 TTS: Audio response sent to {client_id} ({len(audio_data)} bytes)")
-                # Clean up temporary file
-                import os
-                try:
-                    os.unlink(audio_path)
-                except:
-                    pass
             else:
-                # Send error message
                 await self.send_message(client_id, {
-                    "type": "tts_error",
-                    "message": f"TTS synthesis failed: {status}",
-                    "text": text,
                     "timestamp": datetime.now().isoformat()
                 })
         except Exception as e:
             safe_log("error", f"TTS WebSocket error for {client_id}: {e}")
@@ -439,7 +479,7 @@ class WebSocketTTSHandler:
         message_type = message_data.get("type")
         if message_type == "tts_synthesize":
-            # Text-to-speech synthesis request
             text = message_data.get("text", "")
             voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
@@ -452,6 +492,22 @@ class WebSocketTTSHandler:
                     "timestamp": datetime.now().isoformat()
                 })
         elif message_type == "tts_get_voices":
             # Request available voice presets
             await self.send_message(client_id, {
@@ -460,6 +516,22 @@ class WebSocketTTSHandler:
                 "timestamp": datetime.now().isoformat()
             })
         else:
             safe_log("warning", f"Unknown TTS message type from {client_id}: {message_type}")

                 safe_log("error", f"Failed to send message to TTS client {client_id}: {e}")
                 await self.disconnect(client_id)
+    async def handle_streaming_text_synthesis(self, client_id: str, text_chunks: list, voice_preset: str = "v2/en_speaker_6", is_final: bool = True):
+        """Process streaming text synthesis following unmute.sh methodology"""
         try:
+            # UNMUTE.SH METHODOLOGY: Process text chunks in streaming fashion
+            safe_log("info", f"🔊 TTS STREAMING: Processing {len(text_chunks)} chunks from {client_id} (final={is_final})")
+            if is_final:
+                # FLUSH TRICK: Process all accumulated text at once for best quality
+                complete_text = " ".join(text_chunks).strip()
+                if complete_text:
+                    safe_log("info", f"🔊 TTS FLUSH: Final synthesis for {client_id}: {complete_text[:50]}...")
+                    # Use the existing ZeroGPU synthesize_speech function
+                    audio_path, status = synthesize_speech(complete_text, voice_preset)
+                    if audio_path and "✅" in status:
+                        # Read the generated audio file
+                        with open(audio_path, 'rb') as audio_file:
+                            audio_data = audio_file.read()
+                        # Encode audio as base64 for WebSocket transmission
+                        audio_b64 = base64.b64encode(audio_data).decode('utf-8')
+                        # Send successful synthesis with streaming metadata
+                        await self.send_message(client_id, {
+                            "type": "tts_streaming_response",
+                            "audio_data": audio_b64,
+                            "audio_format": "wav",
+                            "text": complete_text,
+                            "text_chunks": text_chunks,
+                            "voice_preset": voice_preset,
+                            "timestamp": datetime.now().isoformat(),
+                            "audio_size": len(audio_data),
+                            "status": status,
+                            "is_final": is_final,
+                            "streaming_method": "unmute.sh_flush_trick"
+                        })
+                        safe_log("info", f"🔊 TTS STREAMING: Final audio sent to {client_id} ({len(audio_data)} bytes)")
+                        # Clean up temporary file
+                        import os
+                        try:
+                            os.unlink(audio_path)
+                        except:
+                            pass
+                    else:
+                        # Send error message
+                        await self.send_message(client_id, {
+                            "type": "tts_streaming_error",
+                            "message": f"TTS streaming synthesis failed: {status}",
+                            "text": complete_text,
+                            "is_final": is_final,
+                            "timestamp": datetime.now().isoformat()
+                        })
+                else:
+                    # Empty final flush
+                    safe_log("info", f"🔊 TTS FLUSH: Empty final text for {client_id}")
             else:
+                # STREAMING: Send partial progress update (no audio yet)
                 await self.send_message(client_id, {
+                    "type": "tts_streaming_progress",
+                    "message": f"Buffering text chunks: {len(text_chunks)}",
+                    "text_chunks": text_chunks[-3:],  # Show last 3 chunks for progress
+                    "is_final": is_final,
                     "timestamp": datetime.now().isoformat()
                 })
+                safe_log("info", f"🔊 TTS STREAMING: Progress update sent to {client_id} ({len(text_chunks)} chunks)")
+        except Exception as e:
+            safe_log("error", f"TTS streaming error for {client_id}: {e}")
+            await self.send_message(client_id, {
+                "type": "tts_streaming_error",
+                "message": f"TTS streaming error: {str(e)}",
+                "is_final": is_final,
+                "timestamp": datetime.now().isoformat()
+            })
+    async def handle_text_synthesis(self, client_id: str, text: str, voice_preset: str = "v2/en_speaker_6"):
+        """Process text synthesis with real TTS service (legacy single-shot method)"""
+        try:
+            safe_log("info", f"🔊 TTS: Processing text from {client_id}: {text[:50]}...")
+            # Use streaming method with single chunk for consistency
+            await self.handle_streaming_text_synthesis(client_id, [text], voice_preset, is_final=True)
         except Exception as e:
             safe_log("error", f"TTS WebSocket error for {client_id}: {e}")
         message_type = message_data.get("type")
         if message_type == "tts_synthesize":
+            # Text-to-speech synthesis request (legacy single-shot)
             text = message_data.get("text", "")
             voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
                     "timestamp": datetime.now().isoformat()
                 })
+        elif message_type == "tts_streaming_synthesize":
+            # Streaming text-to-speech synthesis request (unmute.sh methodology)
+            text_chunks = message_data.get("text_chunks", [])
+            voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
+            is_final = message_data.get("is_final", True)
+            if text_chunks:
+                await self.handle_streaming_text_synthesis(client_id, text_chunks, voice_preset, is_final)
+            else:
+                await self.send_message(client_id, {
+                    "type": "tts_streaming_error",
+                    "message": "Empty text chunks provided for streaming synthesis",
+                    "is_final": is_final,
+                    "timestamp": datetime.now().isoformat()
+                })
         elif message_type == "tts_get_voices":
             # Request available voice presets
             await self.send_message(client_id, {
                 "timestamp": datetime.now().isoformat()
             })
+        elif message_type == "tts_get_streaming_info":
+            # Request streaming capabilities info
+            await self.send_message(client_id, {
+                "type": "tts_streaming_info",
+                "streaming_supported": True,
+                "methodology": "unmute.sh with flush trick",
+                "message_types": {
+                    "tts_streaming_synthesize": "Send text chunks for streaming processing",
+                    "tts_streaming_response": "Receive final audio with streaming metadata",
+                    "tts_streaming_progress": "Receive progress updates during buffering",
+                    "tts_streaming_error": "Receive streaming-specific error messages"
+                },
+                "flush_trick": "Set is_final=true to trigger synthesis of all buffered chunks",
+                "timestamp": datetime.now().isoformat()
+            })
         else:
             safe_log("warning", f"Unknown TTS message type from {client_id}: {message_type}")