mehdilaalali committed on
Commit
a478940
·
verified ·
1 Parent(s): effb38e

feat: add zero-shot ref_audio directly to TTS tab

Browse files
Files changed (1) hide show
  1. app.py +16 -4
app.py CHANGED
@@ -39,7 +39,7 @@ BUILTIN_VOICES = {
39
  "Default (no voice clone)": None,
40
  }
41
 
42
- def synthesize_speech(text, voice_id_input, audio_format):
43
  """Convert text → speech using Voxtral Mini TTS."""
44
  if not text.strip():
45
  return None, "⚠️ Please enter some text."
@@ -54,6 +54,13 @@ def synthesize_speech(text, voice_id_input, audio_format):
54
  )
55
  if voice_id:
56
  kwargs["voice_id"] = voice_id
 
 
 
 
 
 
 
57
 
58
  response = client.audio.speech.complete(**kwargs)
59
  audio_bytes = base64.b64decode(response.audio_data)
@@ -269,8 +276,13 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
269
  value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
270
  )
271
  tts_voice_id = gr.Textbox(
272
- label="Voice ID (optional)",
273
- placeholder="Paste a voice ID from the Voice Cloning tab, or leave blank for default voice",
 
 
 
 
 
274
  )
275
  tts_format = gr.Dropdown(
276
  choices=["mp3", "wav", "flac", "opus"],
@@ -289,7 +301,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
289
 
290
  tts_btn.click(
291
  fn=synthesize_speech,
292
- inputs=[tts_text, tts_voice_id, tts_format],
293
  outputs=[tts_audio_out, tts_status],
294
  )
295
 
 
39
  "Default (no voice clone)": None,
40
  }
41
 
42
+ def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
43
  """Convert text → speech using Voxtral Mini TTS."""
44
  if not text.strip():
45
  return None, "⚠️ Please enter some text."
 
54
  )
55
  if voice_id:
56
  kwargs["voice_id"] = voice_id
57
+
58
+ # Add Reference Audio for Zero-shot tone/voice cloning
59
+ if ref_audio_path:
60
+ with open(ref_audio_path, "rb") as f:
61
+ import base64
62
+ ref_audio_b64 = base64.b64encode(f.read()).decode("utf-8")
63
+ kwargs["ref_audio"] = ref_audio_b64
64
 
65
  response = client.audio.speech.complete(**kwargs)
66
  audio_bytes = base64.b64decode(response.audio_data)
 
276
  value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
277
  )
278
  tts_voice_id = gr.Textbox(
279
+ label="Saved Voice ID (Optional)",
280
+ placeholder="Paste a Voice ID from the Voice Cloning tab...",
281
+ )
282
+ tts_ref_audio = gr.Audio(
283
+ label="OR: Reference Audio (Set voice tone instantly)",
284
+ sources=["upload", "microphone"],
285
+ type="filepath",
286
  )
287
  tts_format = gr.Dropdown(
288
  choices=["mp3", "wav", "flac", "opus"],
 
301
 
302
  tts_btn.click(
303
  fn=synthesize_speech,
304
+ inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
305
  outputs=[tts_audio_out, tts_status],
306
  )
307