Spaces:
Sleeping
Sleeping
Michael Hu
committed on
Commit
·
22bd0b9
1
Parent(s):
5a72681
use kokoro fastAPI server to generate voice
Browse files
- utils/tts.py +34 -3
utils/tts.py
CHANGED
|
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
|
|
| 9 |
|
| 10 |
# Flag to track TTS engine availability
|
| 11 |
KOKORO_AVAILABLE = False
|
|
|
|
| 12 |
DIA_AVAILABLE = False
|
| 13 |
|
| 14 |
# Try to import Kokoro first
|
|
@@ -25,7 +26,9 @@ except AttributeError as e:
|
|
| 25 |
result = client.predict(
|
| 26 |
api_name="/lambda"
|
| 27 |
)
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
else:
|
| 30 |
# Re-raise if it's a different error
|
| 31 |
logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
|
|
@@ -97,14 +100,32 @@ class TTSEngine:
|
|
| 97 |
logger.error(f"Failed to initialize Kokoro pipeline: {str(kokoro_err)}")
|
| 98 |
logger.error(f"Error type: {type(kokoro_err).__name__}")
|
| 99 |
logger.info("Will try to fall back to Dia TTS engine")
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
# Try Dia if Kokoro is not available or failed to initialize
|
| 103 |
if self.engine_type is None and DIA_AVAILABLE:
|
| 104 |
logger.info("Using Dia as fallback TTS engine")
|
| 105 |
# For Dia, we don't need to initialize anything here
|
| 106 |
# The model will be lazy-loaded when needed
|
| 107 |
self.pipeline = None
|
|
|
|
| 108 |
self.engine_type = "dia"
|
| 109 |
logger.info("TTS engine initialized with Dia (lazy loading)")
|
| 110 |
|
|
@@ -113,6 +134,7 @@ class TTSEngine:
|
|
| 113 |
logger.warning("Using dummy TTS implementation as no TTS engines are available")
|
| 114 |
logger.warning("Check logs above for specific errors that prevented Kokoro or Dia initialization")
|
| 115 |
self.pipeline = None
|
|
|
|
| 116 |
self.engine_type = "dummy"
|
| 117 |
|
| 118 |
def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
|
|
@@ -145,6 +167,15 @@ class TTSEngine:
|
|
| 145 |
logger.info(f"Saving Kokoro audio to {output_path}")
|
| 146 |
sf.write(output_path, audio, 24000)
|
| 147 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
elif self.engine_type == "dia":
|
| 149 |
# Use Dia for TTS generation
|
| 150 |
try:
|
|
|
|
| 9 |
|
| 10 |
# Flag to track TTS engine availability
|
| 11 |
KOKORO_AVAILABLE = False
|
| 12 |
+
KOKORO_SPACE_AVAILABLE = False
|
| 13 |
DIA_AVAILABLE = False
|
| 14 |
|
| 15 |
# Try to import Kokoro first
|
|
|
|
| 26 |
result = client.predict(
|
| 27 |
api_name="/lambda"
|
| 28 |
)
|
| 29 |
+
logger.debug(f"result get back from Kokora FastAPI server: {result}")
|
| 30 |
+
if result:
|
| 31 |
+
KOKORO_SPACE_AVAILABLE = True
|
| 32 |
else:
|
| 33 |
# Re-raise if it's a different error
|
| 34 |
logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
|
|
|
|
| 100 |
logger.error(f"Failed to initialize Kokoro pipeline: {str(kokoro_err)}")
|
| 101 |
logger.error(f"Error type: {type(kokoro_err).__name__}")
|
| 102 |
logger.info("Will try to fall back to Dia TTS engine")
|
| 103 |
+
|
| 104 |
+
if KOKORO_SPACE_AVAILABLE:
|
| 105 |
+
logger.info(f"Using Kokoro FastAPI server as primary TTS engine with language code: {lang_code}")
|
| 106 |
+
try:
|
| 107 |
+
self.client = Client("Remsky/Kokoro-TTS-Zero")
|
| 108 |
+
self.engine_type = "kokoro_space"
|
| 109 |
+
logger.info("TTS engine successfully initialized with Kokoro FastAPI server")
|
| 110 |
+
result = client.predict(
|
| 111 |
+
text="The studio was filled with the rich odour of roses, and when the light",
|
| 112 |
+
voice_names=None,
|
| 113 |
+
speed=1,
|
| 114 |
+
api_name="/generate_speech_from_ui"
|
| 115 |
+
)
|
| 116 |
+
logger.info(result)
|
| 117 |
+
except Exception as kokoro_err:
|
| 118 |
+
logger.error(f"Failed to initialize Kokoro pipeline: {str(kokoro_err)}")
|
| 119 |
+
logger.error(f"Error type: {type(kokoro_err).__name__}")
|
| 120 |
+
logger.info("Will try to fall back to Dia TTS engine")
|
| 121 |
+
|
| 122 |
# Try Dia if Kokoro is not available or failed to initialize
|
| 123 |
if self.engine_type is None and DIA_AVAILABLE:
|
| 124 |
logger.info("Using Dia as fallback TTS engine")
|
| 125 |
# For Dia, we don't need to initialize anything here
|
| 126 |
# The model will be lazy-loaded when needed
|
| 127 |
self.pipeline = None
|
| 128 |
+
self.client = None
|
| 129 |
self.engine_type = "dia"
|
| 130 |
logger.info("TTS engine initialized with Dia (lazy loading)")
|
| 131 |
|
|
|
|
| 134 |
logger.warning("Using dummy TTS implementation as no TTS engines are available")
|
| 135 |
logger.warning("Check logs above for specific errors that prevented Kokoro or Dia initialization")
|
| 136 |
self.pipeline = None
|
| 137 |
+
self.client = None
|
| 138 |
self.engine_type = "dummy"
|
| 139 |
|
| 140 |
def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
|
|
|
|
| 167 |
logger.info(f"Saving Kokoro audio to {output_path}")
|
| 168 |
sf.write(output_path, audio, 24000)
|
| 169 |
break
|
| 170 |
+
elif self.engine_type == "kokoro_space":
|
| 171 |
+
# Use Kokoro FastAPI server for TTS generation
|
| 172 |
+
logger.info("Generating speech using Kokoro FastAPI server")
|
| 173 |
+
result = self.client.predict(
|
| 174 |
+
text=text,
|
| 175 |
+
voice_names=None,
|
| 176 |
+
speed=speed,
|
| 177 |
+
api_name="/generate_speech_from_ui"
|
| 178 |
+
)
|
| 179 |
elif self.engine_type == "dia":
|
| 180 |
# Use Dia for TTS generation
|
| 181 |
try:
|