fix: add default speaker for non-cloning synthesis
Browse files
app.py
CHANGED
|
@@ -52,6 +52,7 @@ class Settings:
|
|
| 52 |
# Model
|
| 53 |
model_name: str = os.getenv("XTTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
|
| 54 |
default_language: str = os.getenv("XTTS_DEFAULT_LANGUAGE", "pl")
|
|
|
|
| 55 |
|
| 56 |
# Generation params
|
| 57 |
temperature: float = _env_float("XTTS_TEMPERATURE", 0.65)
|
|
@@ -263,21 +264,25 @@ def _synthesize(text: str, language: str, speaker_wav_bytes: Optional[bytes] = N
|
|
| 263 |
with _infer_lock:
|
| 264 |
tmp_path = None
|
| 265 |
try:
|
| 266 |
-
speaker_wav = None
|
| 267 |
if speaker_wav_bytes:
|
| 268 |
-
#
|
| 269 |
-
# (coqui-tts handles caching internally in >=0.27, but we cache the temp file path approach)
|
| 270 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
| 271 |
tmp.write(speaker_wav_bytes)
|
| 272 |
tmp.flush()
|
| 273 |
tmp_path = tmp.name
|
| 274 |
-
speaker_wav = tmp_path
|
| 275 |
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
finally:
|
| 282 |
if tmp_path:
|
| 283 |
try:
|
|
|
|
| 52 |
# Model
|
| 53 |
model_name: str = os.getenv("XTTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
|
| 54 |
default_language: str = os.getenv("XTTS_DEFAULT_LANGUAGE", "pl")
|
| 55 |
+
default_speaker: str = os.getenv("XTTS_DEFAULT_SPEAKER", "Claribel Dervla") # Built-in XTTS speaker
|
| 56 |
|
| 57 |
# Generation params
|
| 58 |
temperature: float = _env_float("XTTS_TEMPERATURE", 0.65)
|
|
|
|
| 264 |
with _infer_lock:
|
| 265 |
tmp_path = None
|
| 266 |
try:
|
|
|
|
| 267 |
if speaker_wav_bytes:
|
| 268 |
+
# Voice cloning mode: use provided speaker WAV
|
|
|
|
| 269 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
| 270 |
tmp.write(speaker_wav_bytes)
|
| 271 |
tmp.flush()
|
| 272 |
tmp_path = tmp.name
|
|
|
|
| 273 |
|
| 274 |
+
audio_np = tts.tts(
|
| 275 |
+
text=text,
|
| 276 |
+
language=language,
|
| 277 |
+
speaker_wav=tmp_path,
|
| 278 |
+
)
|
| 279 |
+
else:
|
| 280 |
+
# Default speaker mode: use built-in speaker
|
| 281 |
+
audio_np = tts.tts(
|
| 282 |
+
text=text,
|
| 283 |
+
language=language,
|
| 284 |
+
speaker=S.default_speaker,
|
| 285 |
+
)
|
| 286 |
finally:
|
| 287 |
if tmp_path:
|
| 288 |
try:
|