chmielvu committed on
Commit
94d7fba
·
verified ·
1 Parent(s): fafbc88

fix: add default speaker for non-cloning synthesis

Browse files
Files changed (1) hide show
  1. app.py +14 -9
app.py CHANGED
@@ -52,6 +52,7 @@ class Settings:
52
  # Model
53
  model_name: str = os.getenv("XTTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
54
  default_language: str = os.getenv("XTTS_DEFAULT_LANGUAGE", "pl")
 
55
 
56
  # Generation params
57
  temperature: float = _env_float("XTTS_TEMPERATURE", 0.65)
@@ -263,21 +264,25 @@ def _synthesize(text: str, language: str, speaker_wav_bytes: Optional[bytes] = N
263
  with _infer_lock:
264
  tmp_path = None
265
  try:
266
- speaker_wav = None
267
  if speaker_wav_bytes:
268
- # Check speaker cache for pre-computed latents
269
- # (coqui-tts handles caching internally in >=0.27, but we cache the temp file path approach)
270
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
271
  tmp.write(speaker_wav_bytes)
272
  tmp.flush()
273
  tmp_path = tmp.name
274
- speaker_wav = tmp_path
275
 
276
- audio_np = tts.tts(
277
- text=text,
278
- language=language,
279
- speaker_wav=speaker_wav,
280
- )
 
 
 
 
 
 
 
281
  finally:
282
  if tmp_path:
283
  try:
 
52
  # Model
53
  model_name: str = os.getenv("XTTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
54
  default_language: str = os.getenv("XTTS_DEFAULT_LANGUAGE", "pl")
55
+ default_speaker: str = os.getenv("XTTS_DEFAULT_SPEAKER", "Claribel Dervla") # Built-in XTTS speaker
56
 
57
  # Generation params
58
  temperature: float = _env_float("XTTS_TEMPERATURE", 0.65)
 
264
  with _infer_lock:
265
  tmp_path = None
266
  try:
 
267
  if speaker_wav_bytes:
268
+ # Voice cloning mode: use provided speaker WAV
 
269
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
270
  tmp.write(speaker_wav_bytes)
271
  tmp.flush()
272
  tmp_path = tmp.name
 
273
 
274
+ audio_np = tts.tts(
275
+ text=text,
276
+ language=language,
277
+ speaker_wav=tmp_path,
278
+ )
279
+ else:
280
+ # Default speaker mode: use built-in speaker
281
+ audio_np = tts.tts(
282
+ text=text,
283
+ language=language,
284
+ speaker=S.default_speaker,
285
+ )
286
  finally:
287
  if tmp_path:
288
  try: