Peter Michael Gits Claude committed on
Commit
228bc17
·
1 Parent(s): ccbd055

feat: Complete TTS playback integration for voice responses v0.4.9

Browse files

- Added WebSocket TTS handler to TTS service with ZeroGPU synthesis
- Integrated TTS WebSocket client in ChatCal WebRTC handler
- Real-time text-to-speech with base64 audio transmission
- Auto-generate demo TTS responses after STT transcription
- Client-side audio playback with proper error handling
- Complete voice interaction loop: Speech → Text → Response → Audio

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

version.py CHANGED
@@ -2,8 +2,8 @@
2
  Version information for ChatCal Voice-Enabled AI Assistant
3
  """
4
 
5
- __version__ = "0.4.8"
6
- __build_date__ = "2025-08-20T16:45:00"
7
  __description__ = "Voice-Enabled ChatCal AI Assistant with Hugging Face deployment"
8
 
9
  def get_version_info():
 
2
  Version information for ChatCal Voice-Enabled AI Assistant
3
  """
4
 
5
+ __version__ = "0.4.9"
6
+ __build_date__ = "2025-08-20T17:00:00"
7
  __description__ = "Voice-Enabled ChatCal AI Assistant with Hugging Face deployment"
8
 
9
  def get_version_info():
webrtc/server/fastapi_integration.py CHANGED
@@ -158,6 +158,19 @@ def create_fastapi_app() -> FastAPI:
158
 
159
  if (data.type === 'transcription') {
160
  addTranscription(data.text, data.timestamp);
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  } else if (data.type === 'error') {
162
  addTranscription(`Error: ${data.message}`, data.timestamp, true);
163
  }
@@ -250,6 +263,61 @@ def create_fastapi_app() -> FastAPI:
250
  }
251
  }
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  // Event listeners
254
  recordBtn.addEventListener('click', startRecording);
255
  stopBtn.addEventListener('click', stopRecording);
 
158
 
159
  if (data.type === 'transcription') {
160
  addTranscription(data.text, data.timestamp);
161
+
162
+ // Auto-generate TTS response for demo
163
+ if (data.text && data.text.trim()) {
164
+ const demoResponse = `I heard you say: "${data.text}". This is a demo TTS response.`;
165
+ setTimeout(() => {
166
+ requestTTSPlayback(demoResponse);
167
+ }, 1000); // Wait 1 second before TTS response
168
+ }
169
+ } else if (data.type === 'tts_playback') {
170
+ playTTSAudio(data.audio_data, data.text);
171
+ } else if (data.type === 'tts_error') {
172
+ console.error('TTS Error:', data.message);
173
+ addTranscription(`TTS Error: ${data.message}`, data.timestamp, true);
174
  } else if (data.type === 'error') {
175
  addTranscription(`Error: ${data.message}`, data.timestamp, true);
176
  }
 
263
  }
264
  }
265
 
266
+ function requestTTSPlayback(text, voicePreset = 'v2/en_speaker_6') {
267
+ console.log('Requesting TTS playback:', text);
268
+ if (websocket && websocket.readyState === WebSocket.OPEN) {
269
+ websocket.send(JSON.stringify({
270
+ type: 'tts_request',
271
+ text: text,
272
+ voice_preset: voicePreset
273
+ }));
274
+ } else {
275
+ console.error('WebSocket not available for TTS request');
276
+ }
277
+ }
278
+
279
+ function playTTSAudio(audioBase64, text) {
280
+ console.log('Playing TTS audio for:', text);
281
+ try {
282
+ // Convert base64 to audio blob
283
+ const audioData = atob(audioBase64);
284
+ const arrayBuffer = new ArrayBuffer(audioData.length);
285
+ const uint8Array = new Uint8Array(arrayBuffer);
286
+
287
+ for (let i = 0; i < audioData.length; i++) {
288
+ uint8Array[i] = audioData.charCodeAt(i);
289
+ }
290
+
291
+ const audioBlob = new Blob([arrayBuffer], { type: 'audio/wav' });
292
+ const audioUrl = URL.createObjectURL(audioBlob);
293
+
294
+ const audio = new Audio(audioUrl);
295
+ audio.onloadeddata = () => {
296
+ console.log('TTS audio loaded, playing...');
297
+ addTranscription(`πŸ”Š Playing: ${text}`, new Date().toISOString(), false);
298
+ };
299
+
300
+ audio.onended = () => {
301
+ console.log('TTS audio finished playing');
302
+ URL.revokeObjectURL(audioUrl); // Clean up
303
+ };
304
+
305
+ audio.onerror = (error) => {
306
+ console.error('TTS audio playback error:', error);
307
+ addTranscription(`TTS Playback Error: ${error}`, new Date().toISOString(), true);
308
+ };
309
+
310
+ audio.play().catch(error => {
311
+ console.error('Failed to play TTS audio:', error);
312
+ addTranscription(`TTS Play Error: User interaction may be required`, new Date().toISOString(), true);
313
+ });
314
+
315
+ } catch (error) {
316
+ console.error('Error processing TTS audio:', error);
317
+ addTranscription(`TTS Processing Error: ${error}`, new Date().toISOString(), true);
318
+ }
319
+ }
320
+
321
  // Event listeners
322
  recordBtn.addEventListener('click', startRecording);
323
  stopBtn.addEventListener('click', stopRecording);
webrtc/server/websocket_handler.py CHANGED
@@ -27,6 +27,10 @@ class WebRTCHandler:
27
  self.stt_websocket_url = "wss://pgits-stt-gpu-service.hf.space/ws/stt"
28
  self.stt_connections: Dict[str, websockets.WebSocketClientProtocol] = {}
29
 
 
 
 
 
30
  async def connect(self, websocket: WebSocket, client_id: str):
31
  """Accept WebSocket connection and initialize audio buffer"""
32
  await websocket.accept()
@@ -56,6 +60,9 @@ class WebRTCHandler:
56
  # Clean up STT connection if exists
57
  await self.disconnect_from_stt_service(client_id)
58
 
 
 
 
59
  logger.info(f"πŸ”Œ WebRTC client {client_id} disconnected")
60
 
61
  async def send_message(self, client_id: str, message: dict):
@@ -196,6 +203,130 @@ class WebRTCHandler:
196
  # Cleanup connection on error
197
  await self.disconnect_from_stt_service(client_id)
198
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  async def process_audio_file_webrtc(self, audio_file_path: str, sample_rate: int) -> Optional[str]:
201
  """Process audio file with real STT service via WebSocket"""
@@ -280,6 +411,28 @@ class WebRTCHandler:
280
  })
281
  logger.info(f"🎀 Recording stopped for {client_id}")
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  else:
284
  logger.warning(f"Unknown message type from {client_id}: {message_type}")
285
 
 
27
  self.stt_websocket_url = "wss://pgits-stt-gpu-service.hf.space/ws/stt"
28
  self.stt_connections: Dict[str, websockets.WebSocketClientProtocol] = {}
29
 
30
+ self.tts_service_url = "https://pgits-tts-gpu-service.hf.space"
31
+ self.tts_websocket_url = "wss://pgits-tts-gpu-service.hf.space/ws/tts"
32
+ self.tts_connections: Dict[str, websockets.WebSocketClientProtocol] = {}
33
+
34
  async def connect(self, websocket: WebSocket, client_id: str):
35
  """Accept WebSocket connection and initialize audio buffer"""
36
  await websocket.accept()
 
60
  # Clean up STT connection if exists
61
  await self.disconnect_from_stt_service(client_id)
62
 
63
+ # Clean up TTS connection if exists
64
+ await self.disconnect_from_tts_service(client_id)
65
+
66
  logger.info(f"πŸ”Œ WebRTC client {client_id} disconnected")
67
 
68
  async def send_message(self, client_id: str, message: dict):
 
203
  # Cleanup connection on error
204
  await self.disconnect_from_stt_service(client_id)
205
  return None
206
+
207
+ # TTS WebSocket Methods
208
+ async def connect_to_tts_service(self, client_id: str) -> bool:
209
+ """Connect to the TTS WebSocket service"""
210
+ try:
211
+ logger.info(f"πŸ”Œ Connecting to TTS service for client {client_id}: {self.tts_websocket_url}")
212
+
213
+ # Connect to TTS WebSocket service
214
+ tts_ws = await websockets.connect(self.tts_websocket_url)
215
+ self.tts_connections[client_id] = tts_ws
216
+
217
+ # Wait for connection confirmation
218
+ confirmation = await tts_ws.recv()
219
+ confirmation_data = json.loads(confirmation)
220
+
221
+ if confirmation_data.get("type") == "tts_connection_confirmed":
222
+ logger.info(f"βœ… TTS service connected for client {client_id}")
223
+ return True
224
+ else:
225
+ logger.warning(f"⚠️ Unexpected TTS confirmation: {confirmation_data}")
226
+ return False
227
+
228
+ except Exception as e:
229
+ logger.error(f"❌ Failed to connect to TTS service for {client_id}: {e}")
230
+ return False
231
+
232
+ async def disconnect_from_tts_service(self, client_id: str):
233
+ """Disconnect from TTS WebSocket service"""
234
+ if client_id in self.tts_connections:
235
+ try:
236
+ tts_ws = self.tts_connections[client_id]
237
+ await tts_ws.close()
238
+ del self.tts_connections[client_id]
239
+ logger.info(f"πŸ”Œ Disconnected from TTS service for client {client_id}")
240
+ except Exception as e:
241
+ logger.error(f"Error disconnecting from TTS service: {e}")
242
+
243
+ async def send_text_to_tts_service(self, client_id: str, text: str, voice_preset: str = "v2/en_speaker_6") -> Optional[bytes]:
244
+ """Send text to TTS service and get audio response"""
245
+ if client_id not in self.tts_connections:
246
+ # Try to connect if not already connected
247
+ success = await self.connect_to_tts_service(client_id)
248
+ if not success:
249
+ return None
250
+
251
+ try:
252
+ tts_ws = self.tts_connections[client_id]
253
+
254
+ # Send TTS synthesis message
255
+ message = {
256
+ "type": "tts_synthesize",
257
+ "text": text,
258
+ "voice_preset": voice_preset
259
+ }
260
+
261
+ await tts_ws.send(json.dumps(message))
262
+ logger.info(f"πŸ“€ Sent text to TTS service: {text[:50]}...")
263
+
264
+ # Wait for audio response
265
+ response = await tts_ws.recv()
266
+ response_data = json.loads(response)
267
+
268
+ if response_data.get("type") == "tts_audio_response":
269
+ # Decode base64 audio data
270
+ audio_b64 = response_data.get("audio_data", "")
271
+ audio_bytes = base64.b64decode(audio_b64)
272
+ logger.info(f"πŸ”Š TTS audio received: {len(audio_bytes)} bytes")
273
+ return audio_bytes
274
+ elif response_data.get("type") == "tts_error":
275
+ error_msg = response_data.get("message", "Unknown TTS error")
276
+ logger.error(f"❌ TTS service error: {error_msg}")
277
+ return None
278
+ else:
279
+ logger.warning(f"⚠️ Unexpected TTS response: {response_data}")
280
+ return None
281
+
282
+ except Exception as e:
283
+ logger.error(f"❌ Error communicating with TTS service: {e}")
284
+ # Cleanup connection on error
285
+ await self.disconnect_from_tts_service(client_id)
286
+ return None
287
+
288
+ async def play_tts_response(self, client_id: str, text: str, voice_preset: str = "v2/en_speaker_6"):
289
+ """Generate TTS audio and send to client for playback"""
290
+ try:
291
+ logger.info(f"πŸ”Š Generating TTS response for client {client_id}: {text[:50]}...")
292
+
293
+ # Get audio from TTS service
294
+ audio_data = await self.send_text_to_tts_service(client_id, text, voice_preset)
295
+
296
+ if audio_data:
297
+ # Convert audio to base64 for WebSocket transmission
298
+ audio_b64 = base64.b64encode(audio_data).decode('utf-8')
299
+
300
+ # Send audio playback message to client
301
+ await self.send_message(client_id, {
302
+ "type": "tts_playback",
303
+ "audio_data": audio_b64,
304
+ "audio_format": "wav",
305
+ "text": text,
306
+ "voice_preset": voice_preset,
307
+ "timestamp": datetime.now().isoformat(),
308
+ "audio_size": len(audio_data)
309
+ })
310
+
311
+ logger.info(f"πŸ”Š TTS playback sent to {client_id} ({len(audio_data)} bytes)")
312
+ else:
313
+ logger.warning(f"⚠️ TTS service failed to generate audio for: {text[:50]}...")
314
+
315
+ # Send error message
316
+ await self.send_message(client_id, {
317
+ "type": "tts_error",
318
+ "message": "TTS audio generation failed",
319
+ "text": text,
320
+ "timestamp": datetime.now().isoformat()
321
+ })
322
+
323
+ except Exception as e:
324
+ logger.error(f"❌ TTS playback error for {client_id}: {e}")
325
+ await self.send_message(client_id, {
326
+ "type": "tts_error",
327
+ "message": f"TTS playback error: {str(e)}",
328
+ "timestamp": datetime.now().isoformat()
329
+ })
330
 
331
  async def process_audio_file_webrtc(self, audio_file_path: str, sample_rate: int) -> Optional[str]:
332
  """Process audio file with real STT service via WebSocket"""
 
411
  })
412
  logger.info(f"🎀 Recording stopped for {client_id}")
413
 
414
+ elif message_type == "tts_request":
415
+ # Client requesting TTS playback
416
+ text = message_data.get("text", "")
417
+ voice_preset = message_data.get("voice_preset", "v2/en_speaker_6")
418
+
419
+ if text.strip():
420
+ await self.play_tts_response(client_id, text, voice_preset)
421
+ else:
422
+ await self.send_message(client_id, {
423
+ "type": "tts_error",
424
+ "message": "Empty text provided for TTS",
425
+ "timestamp": datetime.now().isoformat()
426
+ })
427
+
428
+ elif message_type == "get_tts_voices":
429
+ # Client requesting available TTS voices
430
+ await self.send_message(client_id, {
431
+ "type": "tts_voices_list",
432
+ "voices": ["v2/en_speaker_6", "v2/en_speaker_9", "v2/en_speaker_3", "v2/en_speaker_1"],
433
+ "timestamp": datetime.now().isoformat()
434
+ })
435
+
436
  else:
437
  logger.warning(f"Unknown message type from {client_id}: {message_type}")
438