Spaces:
Sleeping
Sleeping
Peter Michael Gits Claude committed on
Commit ·
072c9ef
1
Parent(s): befe3fe
feat: Add unmute.sh streaming text processing methodology
Browse files
- Implement streaming TTS synthesis with text chunk buffering
- Add flush trick for optimal quality when is_final=true
- Support both legacy single-shot and new streaming message types
- New message types: tts_streaming_synthesize, tts_streaming_response
- Progress updates during buffering with tts_streaming_progress
- Streaming capability discovery via tts_get_streaming_info
- Follows unmute.sh methodology for real-time voice applications
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -380,51 +380,91 @@ class WebSocketTTSHandler:
|
|
| 380 |
safe_log("error", f"Failed to send message to TTS client {client_id}: {e}")
|
| 381 |
await self.disconnect(client_id)
|
| 382 |
|
| 383 |
-
async def
|
| 384 |
-
"""Process text synthesis
|
| 385 |
try:
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
# Use the existing ZeroGPU synthesize_speech function
|
| 389 |
-
audio_path, status = synthesize_speech(text, voice_preset)
|
| 390 |
|
| 391 |
-
if
|
| 392 |
-
#
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
else:
|
| 421 |
-
# Send
|
| 422 |
await self.send_message(client_id, {
|
| 423 |
-
"type": "
|
| 424 |
-
"message": f"
|
| 425 |
-
"
|
|
|
|
| 426 |
"timestamp": datetime.now().isoformat()
|
| 427 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
except Exception as e:
|
| 430 |
safe_log("error", f"TTS WebSocket error for {client_id}: {e}")
|
|
@@ -439,7 +479,7 @@ class WebSocketTTSHandler:
|
|
| 439 |
message_type = message_data.get("type")
|
| 440 |
|
| 441 |
if message_type == "tts_synthesize":
|
| 442 |
-
# Text-to-speech synthesis request
|
| 443 |
text = message_data.get("text", "")
|
| 444 |
voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
|
| 445 |
|
|
@@ -452,6 +492,22 @@ class WebSocketTTSHandler:
|
|
| 452 |
"timestamp": datetime.now().isoformat()
|
| 453 |
})
|
| 454 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 455 |
elif message_type == "tts_get_voices":
|
| 456 |
# Request available voice presets
|
| 457 |
await self.send_message(client_id, {
|
|
@@ -460,6 +516,22 @@ class WebSocketTTSHandler:
|
|
| 460 |
"timestamp": datetime.now().isoformat()
|
| 461 |
})
|
| 462 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
else:
|
| 464 |
safe_log("warning", f"Unknown TTS message type from {client_id}: {message_type}")
|
| 465 |
|
|
|
|
| 380 |
safe_log("error", f"Failed to send message to TTS client {client_id}: {e}")
|
| 381 |
await self.disconnect(client_id)
|
| 382 |
|
| 383 |
+
async def handle_streaming_text_synthesis(self, client_id: str, text_chunks: list, voice_preset: str = "v2/en_speaker_6", is_final: bool = True):
|
| 384 |
+
"""Process streaming text synthesis following unmute.sh methodology"""
|
| 385 |
try:
|
| 386 |
+
# UNMUTE.SH METHODOLOGY: Process text chunks in streaming fashion
|
| 387 |
+
safe_log("info", f"π TTS STREAMING: Processing {len(text_chunks)} chunks from {client_id} (final={is_final})")
|
|
|
|
|
|
|
| 388 |
|
| 389 |
+
if is_final:
|
| 390 |
+
# FLUSH TRICK: Process all accumulated text at once for best quality
|
| 391 |
+
complete_text = " ".join(text_chunks).strip()
|
| 392 |
+
if complete_text:
|
| 393 |
+
safe_log("info", f"π TTS FLUSH: Final synthesis for {client_id}: {complete_text[:50]}...")
|
| 394 |
+
|
| 395 |
+
# Use the existing ZeroGPU synthesize_speech function
|
| 396 |
+
audio_path, status = synthesize_speech(complete_text, voice_preset)
|
| 397 |
+
|
| 398 |
+
if audio_path and "✅" in status:
|
| 399 |
+
# Read the generated audio file
|
| 400 |
+
with open(audio_path, 'rb') as audio_file:
|
| 401 |
+
audio_data = audio_file.read()
|
| 402 |
+
|
| 403 |
+
# Encode audio as base64 for WebSocket transmission
|
| 404 |
+
audio_b64 = base64.b64encode(audio_data).decode('utf-8')
|
| 405 |
+
|
| 406 |
+
# Send successful synthesis with streaming metadata
|
| 407 |
+
await self.send_message(client_id, {
|
| 408 |
+
"type": "tts_streaming_response",
|
| 409 |
+
"audio_data": audio_b64,
|
| 410 |
+
"audio_format": "wav",
|
| 411 |
+
"text": complete_text,
|
| 412 |
+
"text_chunks": text_chunks,
|
| 413 |
+
"voice_preset": voice_preset,
|
| 414 |
+
"timestamp": datetime.now().isoformat(),
|
| 415 |
+
"audio_size": len(audio_data),
|
| 416 |
+
"status": status,
|
| 417 |
+
"is_final": is_final,
|
| 418 |
+
"streaming_method": "unmute.sh_flush_trick"
|
| 419 |
+
})
|
| 420 |
+
|
| 421 |
+
safe_log("info", f"π TTS STREAMING: Final audio sent to {client_id} ({len(audio_data)} bytes)")
|
| 422 |
+
|
| 423 |
+
# Clean up temporary file
|
| 424 |
+
import os
|
| 425 |
+
try:
|
| 426 |
+
os.unlink(audio_path)
|
| 427 |
+
except:
|
| 428 |
+
pass
|
| 429 |
+
else:
|
| 430 |
+
# Send error message
|
| 431 |
+
await self.send_message(client_id, {
|
| 432 |
+
"type": "tts_streaming_error",
|
| 433 |
+
"message": f"TTS streaming synthesis failed: {status}",
|
| 434 |
+
"text": complete_text,
|
| 435 |
+
"is_final": is_final,
|
| 436 |
+
"timestamp": datetime.now().isoformat()
|
| 437 |
+
})
|
| 438 |
+
else:
|
| 439 |
+
# Empty final flush
|
| 440 |
+
safe_log("info", f"π TTS FLUSH: Empty final text for {client_id}")
|
| 441 |
else:
|
| 442 |
+
# STREAMING: Send partial progress update (no audio yet)
|
| 443 |
await self.send_message(client_id, {
|
| 444 |
+
"type": "tts_streaming_progress",
|
| 445 |
+
"message": f"Buffering text chunks: {len(text_chunks)}",
|
| 446 |
+
"text_chunks": text_chunks[-3:], # Show last 3 chunks for progress
|
| 447 |
+
"is_final": is_final,
|
| 448 |
"timestamp": datetime.now().isoformat()
|
| 449 |
})
|
| 450 |
+
safe_log("info", f"π TTS STREAMING: Progress update sent to {client_id} ({len(text_chunks)} chunks)")
|
| 451 |
+
|
| 452 |
+
except Exception as e:
|
| 453 |
+
safe_log("error", f"TTS streaming error for {client_id}: {e}")
|
| 454 |
+
await self.send_message(client_id, {
|
| 455 |
+
"type": "tts_streaming_error",
|
| 456 |
+
"message": f"TTS streaming error: {str(e)}",
|
| 457 |
+
"is_final": is_final,
|
| 458 |
+
"timestamp": datetime.now().isoformat()
|
| 459 |
+
})
|
| 460 |
+
|
| 461 |
+
async def handle_text_synthesis(self, client_id: str, text: str, voice_preset: str = "v2/en_speaker_6"):
|
| 462 |
+
"""Process text synthesis with real TTS service (legacy single-shot method)"""
|
| 463 |
+
try:
|
| 464 |
+
safe_log("info", f"π TTS: Processing text from {client_id}: {text[:50]}...")
|
| 465 |
+
|
| 466 |
+
# Use streaming method with single chunk for consistency
|
| 467 |
+
await self.handle_streaming_text_synthesis(client_id, [text], voice_preset, is_final=True)
|
| 468 |
|
| 469 |
except Exception as e:
|
| 470 |
safe_log("error", f"TTS WebSocket error for {client_id}: {e}")
|
|
|
|
| 479 |
message_type = message_data.get("type")
|
| 480 |
|
| 481 |
if message_type == "tts_synthesize":
|
| 482 |
+
# Text-to-speech synthesis request (legacy single-shot)
|
| 483 |
text = message_data.get("text", "")
|
| 484 |
voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
|
| 485 |
|
|
|
|
| 492 |
"timestamp": datetime.now().isoformat()
|
| 493 |
})
|
| 494 |
|
| 495 |
+
elif message_type == "tts_streaming_synthesize":
|
| 496 |
+
# Streaming text-to-speech synthesis request (unmute.sh methodology)
|
| 497 |
+
text_chunks = message_data.get("text_chunks", [])
|
| 498 |
+
voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
|
| 499 |
+
is_final = message_data.get("is_final", True)
|
| 500 |
+
|
| 501 |
+
if text_chunks:
|
| 502 |
+
await self.handle_streaming_text_synthesis(client_id, text_chunks, voice_preset, is_final)
|
| 503 |
+
else:
|
| 504 |
+
await self.send_message(client_id, {
|
| 505 |
+
"type": "tts_streaming_error",
|
| 506 |
+
"message": "Empty text chunks provided for streaming synthesis",
|
| 507 |
+
"is_final": is_final,
|
| 508 |
+
"timestamp": datetime.now().isoformat()
|
| 509 |
+
})
|
| 510 |
+
|
| 511 |
elif message_type == "tts_get_voices":
|
| 512 |
# Request available voice presets
|
| 513 |
await self.send_message(client_id, {
|
|
|
|
| 516 |
"timestamp": datetime.now().isoformat()
|
| 517 |
})
|
| 518 |
|
| 519 |
+
elif message_type == "tts_get_streaming_info":
|
| 520 |
+
# Request streaming capabilities info
|
| 521 |
+
await self.send_message(client_id, {
|
| 522 |
+
"type": "tts_streaming_info",
|
| 523 |
+
"streaming_supported": True,
|
| 524 |
+
"methodology": "unmute.sh with flush trick",
|
| 525 |
+
"message_types": {
|
| 526 |
+
"tts_streaming_synthesize": "Send text chunks for streaming processing",
|
| 527 |
+
"tts_streaming_response": "Receive final audio with streaming metadata",
|
| 528 |
+
"tts_streaming_progress": "Receive progress updates during buffering",
|
| 529 |
+
"tts_streaming_error": "Receive streaming-specific error messages"
|
| 530 |
+
},
|
| 531 |
+
"flush_trick": "Set is_final=true to trigger synthesis of all buffered chunks",
|
| 532 |
+
"timestamp": datetime.now().isoformat()
|
| 533 |
+
})
|
| 534 |
+
|
| 535 |
else:
|
| 536 |
safe_log("warning", f"Unknown TTS message type from {client_id}: {message_type}")
|
| 537 |
|