Peter Michael Gits Claude committed on
Commit
072c9ef
·
1 Parent(s): befe3fe

feat: Add unmute.sh streaming text processing methodology

Browse files

- Implement streaming TTS synthesis with text chunk buffering
- Add flush trick for optimal quality when is_final=true
- Support both legacy single-shot and new streaming message types
- New message types: tts_streaming_synthesize, tts_streaming_response
- Progress updates during buffering with tts_streaming_progress
- Streaming capability discovery via tts_get_streaming_info
- Follows unmute.sh methodology for real-time voice applications

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +112 -40
app.py CHANGED
@@ -380,51 +380,91 @@ class WebSocketTTSHandler:
380
  safe_log("error", f"Failed to send message to TTS client {client_id}: {e}")
381
  await self.disconnect(client_id)
382
 
383
- async def handle_text_synthesis(self, client_id: str, text: str, voice_preset: str = "v2/en_speaker_6"):
384
- """Process text synthesis with real TTS service"""
385
  try:
386
- safe_log("info", f"πŸ”Š TTS: Processing text from {client_id}: {text[:50]}...")
387
-
388
- # Use the existing ZeroGPU synthesize_speech function
389
- audio_path, status = synthesize_speech(text, voice_preset)
390
 
391
- if audio_path and "βœ…" in status:
392
- # Read the generated audio file
393
- with open(audio_path, 'rb') as audio_file:
394
- audio_data = audio_file.read()
395
-
396
- # Encode audio as base64 for WebSocket transmission
397
- audio_b64 = base64.b64encode(audio_data).decode('utf-8')
398
-
399
- # Send successful synthesis
400
- await self.send_message(client_id, {
401
- "type": "tts_audio_response",
402
- "audio_data": audio_b64,
403
- "audio_format": "wav",
404
- "text": text,
405
- "voice_preset": voice_preset,
406
- "timestamp": datetime.now().isoformat(),
407
- "audio_size": len(audio_data),
408
- "status": status
409
- })
410
-
411
- safe_log("info", f"πŸ”Š TTS: Audio response sent to {client_id} ({len(audio_data)} bytes)")
412
-
413
- # Clean up temporary file
414
- import os
415
- try:
416
- os.unlink(audio_path)
417
- except:
418
- pass
419
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  else:
421
- # Send error message
422
  await self.send_message(client_id, {
423
- "type": "tts_error",
424
- "message": f"TTS synthesis failed: {status}",
425
- "text": text,
 
426
  "timestamp": datetime.now().isoformat()
427
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
 
429
  except Exception as e:
430
  safe_log("error", f"TTS WebSocket error for {client_id}: {e}")
@@ -439,7 +479,7 @@ class WebSocketTTSHandler:
439
  message_type = message_data.get("type")
440
 
441
  if message_type == "tts_synthesize":
442
- # Text-to-speech synthesis request
443
  text = message_data.get("text", "")
444
  voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
445
 
@@ -452,6 +492,22 @@ class WebSocketTTSHandler:
452
  "timestamp": datetime.now().isoformat()
453
  })
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  elif message_type == "tts_get_voices":
456
  # Request available voice presets
457
  await self.send_message(client_id, {
@@ -460,6 +516,22 @@ class WebSocketTTSHandler:
460
  "timestamp": datetime.now().isoformat()
461
  })
462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  else:
464
  safe_log("warning", f"Unknown TTS message type from {client_id}: {message_type}")
465
 
 
380
  safe_log("error", f"Failed to send message to TTS client {client_id}: {e}")
381
  await self.disconnect(client_id)
382
 
383
+ async def handle_streaming_text_synthesis(self, client_id: str, text_chunks: list, voice_preset: str = "v2/en_speaker_6", is_final: bool = True):
384
+ """Process streaming text synthesis following unmute.sh methodology"""
385
  try:
386
+ # UNMUTE.SH METHODOLOGY: Process text chunks in streaming fashion
387
+ safe_log("info", f"πŸ”Š TTS STREAMING: Processing {len(text_chunks)} chunks from {client_id} (final={is_final})")
 
 
388
 
389
+ if is_final:
390
+ # FLUSH TRICK: Process all accumulated text at once for best quality
391
+ complete_text = " ".join(text_chunks).strip()
392
+ if complete_text:
393
+ safe_log("info", f"πŸ”Š TTS FLUSH: Final synthesis for {client_id}: {complete_text[:50]}...")
394
+
395
+ # Use the existing ZeroGPU synthesize_speech function
396
+ audio_path, status = synthesize_speech(complete_text, voice_preset)
397
+
398
+ if audio_path and "βœ…" in status:
399
+ # Read the generated audio file
400
+ with open(audio_path, 'rb') as audio_file:
401
+ audio_data = audio_file.read()
402
+
403
+ # Encode audio as base64 for WebSocket transmission
404
+ audio_b64 = base64.b64encode(audio_data).decode('utf-8')
405
+
406
+ # Send successful synthesis with streaming metadata
407
+ await self.send_message(client_id, {
408
+ "type": "tts_streaming_response",
409
+ "audio_data": audio_b64,
410
+ "audio_format": "wav",
411
+ "text": complete_text,
412
+ "text_chunks": text_chunks,
413
+ "voice_preset": voice_preset,
414
+ "timestamp": datetime.now().isoformat(),
415
+ "audio_size": len(audio_data),
416
+ "status": status,
417
+ "is_final": is_final,
418
+ "streaming_method": "unmute.sh_flush_trick"
419
+ })
420
+
421
+ safe_log("info", f"πŸ”Š TTS STREAMING: Final audio sent to {client_id} ({len(audio_data)} bytes)")
422
+
423
+ # Clean up temporary file
424
+ import os
425
+ try:
426
+ os.unlink(audio_path)
427
+ except:
428
+ pass
429
+ else:
430
+ # Send error message
431
+ await self.send_message(client_id, {
432
+ "type": "tts_streaming_error",
433
+ "message": f"TTS streaming synthesis failed: {status}",
434
+ "text": complete_text,
435
+ "is_final": is_final,
436
+ "timestamp": datetime.now().isoformat()
437
+ })
438
+ else:
439
+ # Empty final flush
440
+ safe_log("info", f"πŸ”Š TTS FLUSH: Empty final text for {client_id}")
441
  else:
442
+ # STREAMING: Send partial progress update (no audio yet)
443
  await self.send_message(client_id, {
444
+ "type": "tts_streaming_progress",
445
+ "message": f"Buffering text chunks: {len(text_chunks)}",
446
+ "text_chunks": text_chunks[-3:], # Show last 3 chunks for progress
447
+ "is_final": is_final,
448
  "timestamp": datetime.now().isoformat()
449
  })
450
+ safe_log("info", f"πŸ”Š TTS STREAMING: Progress update sent to {client_id} ({len(text_chunks)} chunks)")
451
+
452
+ except Exception as e:
453
+ safe_log("error", f"TTS streaming error for {client_id}: {e}")
454
+ await self.send_message(client_id, {
455
+ "type": "tts_streaming_error",
456
+ "message": f"TTS streaming error: {str(e)}",
457
+ "is_final": is_final,
458
+ "timestamp": datetime.now().isoformat()
459
+ })
460
+
461
+ async def handle_text_synthesis(self, client_id: str, text: str, voice_preset: str = "v2/en_speaker_6"):
462
+ """Process text synthesis with real TTS service (legacy single-shot method)"""
463
+ try:
464
+ safe_log("info", f"πŸ”Š TTS: Processing text from {client_id}: {text[:50]}...")
465
+
466
+ # Use streaming method with single chunk for consistency
467
+ await self.handle_streaming_text_synthesis(client_id, [text], voice_preset, is_final=True)
468
 
469
  except Exception as e:
470
  safe_log("error", f"TTS WebSocket error for {client_id}: {e}")
 
479
  message_type = message_data.get("type")
480
 
481
  if message_type == "tts_synthesize":
482
+ # Text-to-speech synthesis request (legacy single-shot)
483
  text = message_data.get("text", "")
484
  voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
485
 
 
492
  "timestamp": datetime.now().isoformat()
493
  })
494
 
495
+ elif message_type == "tts_streaming_synthesize":
496
+ # Streaming text-to-speech synthesis request (unmute.sh methodology)
497
+ text_chunks = message_data.get("text_chunks", [])
498
+ voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
499
+ is_final = message_data.get("is_final", True)
500
+
501
+ if text_chunks:
502
+ await self.handle_streaming_text_synthesis(client_id, text_chunks, voice_preset, is_final)
503
+ else:
504
+ await self.send_message(client_id, {
505
+ "type": "tts_streaming_error",
506
+ "message": "Empty text chunks provided for streaming synthesis",
507
+ "is_final": is_final,
508
+ "timestamp": datetime.now().isoformat()
509
+ })
510
+
511
  elif message_type == "tts_get_voices":
512
  # Request available voice presets
513
  await self.send_message(client_id, {
 
516
  "timestamp": datetime.now().isoformat()
517
  })
518
 
519
+ elif message_type == "tts_get_streaming_info":
520
+ # Request streaming capabilities info
521
+ await self.send_message(client_id, {
522
+ "type": "tts_streaming_info",
523
+ "streaming_supported": True,
524
+ "methodology": "unmute.sh with flush trick",
525
+ "message_types": {
526
+ "tts_streaming_synthesize": "Send text chunks for streaming processing",
527
+ "tts_streaming_response": "Receive final audio with streaming metadata",
528
+ "tts_streaming_progress": "Receive progress updates during buffering",
529
+ "tts_streaming_error": "Receive streaming-specific error messages"
530
+ },
531
+ "flush_trick": "Set is_final=true to trigger synthesis of all buffered chunks",
532
+ "timestamp": datetime.now().isoformat()
533
+ })
534
+
535
  else:
536
  safe_log("warning", f"Unknown TTS message type from {client_id}: {message_type}")
537