Spaces:

RSHVR
/

Command_RTC

Sleeping

App Files Files Community

RSHVR committed on Mar 30, 2025

Commit

2c564da

verified ·

1 Parent(s): 8d98b9d

Update tts.py

Browse files

Files changed (1) hide show

tts.py +82 -39

tts.py CHANGED Viewed

@@ -1,7 +1,11 @@
 import os
 import torch
 import torchaudio
 import spaces
 from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio
@@ -11,48 +15,87 @@ os.makedirs("outputs", exist_ok=True)
 # Module-level handle for the shared TTS model; stays None until the model is initialized
 tts_model = None
-# Synchronous function with GPU decorator
-@spaces.GPU
-def _generate_speech_gpu(text, voice_preset="random", voice_file_path=None):
-    """Synthesize `text` with Tortoise-TTS, save it as a WAV file and return it.
-    Args:
-        text: Text to synthesize; it is also hashed to build the output filename.
-        voice_preset: Value forwarded as `preset=` to `tts_with_preset`; replaced
-            by None when a voice file is loaded.
-        voice_file_path: Optional path to a reference recording; used only when
-            the path exists on disk.
-    Returns:
-        A tuple `(output_filename, (24000, audio))` where `audio` is the generated
-        tensor after `squeeze(0).cpu()`.
-    Raises:
-        Exception: any error is printed and then re-raised unchanged.
-    """
-    global tts_model
-    try:
-        # Initialize the model if not already initialized
         if tts_model is None:
-            print("Initializing Tortoise-TTS model...")
-            tts_model = TextToSpeech(use_deepspeed=torch.cuda.is_available())
-            print(f"Model initialized. Using device: {next(tts_model.autoregressive.parameters()).device}")
-        # Process voice sample if provided
-        voice_samples = None
-        if voice_file_path and os.path.exists(voice_file_path):
-            print(f"Loading voice from {voice_file_path}")
-            # NOTE(review): assumes load_audio returns a 2-tuple — confirm against the installed Tortoise version
-            voice_samples, _ = load_audio(voice_file_path, 22050)
-            voice_samples = [voice_samples]
-            # A custom voice sample replaces the preset
-            voice_preset = None
-        # Generate speech
-        print(f"Generating speech for text: {text[:50]}...")
-        # Filename is derived from the text hash reduced modulo 10000
-        output_filename = f"outputs/tts_output_{hash(text) % 10000}.wav"
-        gen = tts_model.tts_with_preset(
-            text,
-            voice_samples=voice_samples,
-            preset=voice_preset
-        )
-        # Save the generated audio
-        torchaudio.save(output_filename, gen.squeeze(0).cpu(), 24000)
-        print(f"Speech generated and saved to {output_filename}")
-        # Return the filename and audio data
-        return output_filename, (24000, gen.squeeze(0).cpu())
-    except Exception as e:
-        # Log, then propagate so the caller still sees the failure
-        print(f"Error generating speech: {str(e)}")
-        raise
-# Async wrapper that calls the GPU function
 async def generate_speech(text, voice_preset="random", voice_file_path=None):
-    """Async entry point that forwards its arguments to `_generate_speech_gpu` and returns that function's result."""
-    # Call the GPU-decorated function
-    return _generate_speech_gpu(text, voice_preset, voice_file_path)

+# tts.py
 import os
 import torch
 import torchaudio
 import spaces
+import numpy as np
+from typing import AsyncGenerator, Generator, Optional, Protocol, Tuple, Union
+from numpy.typing import NDArray
 from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_audio
 # Module-level handle for the shared TTS model; stays None until the model is initialized
 tts_model = None
+# Define TTSOptions for compatibility with FastRTC
+class TortoiseOptions:
+    def __init__(self, voice_preset="random", voice_file_path=None):
+        self.voice_preset = voice_preset
+        self.voice_file_path = voice_file_path
+# The main Tortoise TTS wrapper class implementing FastRTC's TTSModel protocol
+class TortoiseTTSModel:
+    def __init__(self):
+        global tts_model
         if tts_model is None:
+            self._initialize_model()
+        self.tts_model = tts_model
+    @spaces.GPU
+    def _initialize_model(self):
+        global tts_model
+        print("Initializing Tortoise-TTS model...")
+        tts_model = TextToSpeech(use_deepspeed=torch.cuda.is_available())
+        print(f"Model initialized. Using device: {next(tts_model.autoregressive.parameters()).device}")
+    @spaces.GPU
+    def _generate_speech(self, text, options=None):
+        options = options or TortoiseOptions()
+        try:
+            # Process voice sample if provided
+            voice_samples = None
+            if options.voice_file_path and os.path.exists(options.voice_file_path):
+                print(f"Loading voice from {options.voice_file_path}")
+                voice_samples, _ = load_audio(options.voice_file_path, 22050)
+                voice_samples = [voice_samples]
+                voice_preset = None
+            else:
+                voice_preset = options.voice_preset
+            # Generate speech
+            print(f"Generating speech for text: {text[:50]}...")
+            gen = self.tts_model.tts_with_preset(
+                text,
+                voice_samples=voice_samples,
+                preset=voice_preset
+            )
+            # Return the audio data with sample rate
+            return 24000, gen.squeeze(0).cpu().numpy().astype(np.float32)
+        except Exception as e:
+            print(f"Error generating speech: {str(e)}")
+            raise
+    def tts(self, text: str, options: Optional[TortoiseOptions] = None) -> Tuple[int, NDArray[np.float32]]:
+        """Generate speech audio from text in a single call"""
+        return self._generate_speech(text, options)
+    async def stream_tts(self, text: str, options: Optional[TortoiseOptions] = None) -> AsyncGenerator[Tuple[int, NDArray[np.float32]], None]:
+        """Stream speech audio asynchronously in chunks"""
+        sample_rate, audio_array = self._generate_speech(text, options)
+        # Split audio into chunks for streaming
+        chunk_size = 4000  # Adjust chunk size as needed
+        for i in range(0, len(audio_array), chunk_size):
+            chunk = audio_array[i:i+chunk_size]
+            yield sample_rate, chunk
+    def stream_tts_sync(self, text: str, options: Optional[TortoiseOptions] = None) -> Generator[Tuple[int, NDArray[np.float32]], None, None]:
+        """Stream speech audio synchronously in chunks"""
+        sample_rate, audio_array = self._generate_speech(text, options)
+        # Split audio into chunks for streaming
+        chunk_size = 4000  # Adjust chunk size as needed
+        for i in range(0, len(audio_array), chunk_size):
+            chunk = audio_array[i:i+chunk_size]
+            yield sample_rate, chunk
+# Create a singleton instance for easy import.
+# Constructing it here runs TortoiseTTSModel.__init__ at import time, which
+# initializes the shared model when the module-level tts_model is still None.
+tortoise_tts = TortoiseTTSModel()
+# Legacy function for backward compatibility
 async def generate_speech(text, voice_preset="random", voice_file_path=None):
+    options = TortoiseOptions(voice_preset, voice_file_path)
+    sample_rate, audio_array = tortoise_tts.tts(text, options)
+    return f"outputs/tts_output_{hash(text) % 10000}.wav", (sample_rate, torch.from_numpy(audio_array))