Spaces:
Sleeping
Sleeping
feat: add zero-shot ref_audio directly to TTS tab
Browse files
app.py
CHANGED
|
@@ -39,7 +39,7 @@ BUILTIN_VOICES = {
|
|
| 39 |
"Default (no voice clone)": None,
|
| 40 |
}
|
| 41 |
|
| 42 |
-
def synthesize_speech(text, voice_id_input, audio_format):
|
| 43 |
"""Convert text → speech using Voxtral Mini TTS."""
|
| 44 |
if not text.strip():
|
| 45 |
return None, "⚠️ Please enter some text."
|
|
@@ -54,6 +54,13 @@ def synthesize_speech(text, voice_id_input, audio_format):
|
|
| 54 |
)
|
| 55 |
if voice_id:
|
| 56 |
kwargs["voice_id"] = voice_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
response = client.audio.speech.complete(**kwargs)
|
| 59 |
audio_bytes = base64.b64decode(response.audio_data)
|
|
@@ -269,8 +276,13 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
|
|
| 269 |
value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
|
| 270 |
)
|
| 271 |
tts_voice_id = gr.Textbox(
|
| 272 |
-
label="Voice ID (
|
| 273 |
-
placeholder="Paste a
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
)
|
| 275 |
tts_format = gr.Dropdown(
|
| 276 |
choices=["mp3", "wav", "flac", "opus"],
|
|
@@ -289,7 +301,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
|
|
| 289 |
|
| 290 |
tts_btn.click(
|
| 291 |
fn=synthesize_speech,
|
| 292 |
-
inputs=[tts_text, tts_voice_id, tts_format],
|
| 293 |
outputs=[tts_audio_out, tts_status],
|
| 294 |
)
|
| 295 |
|
|
|
|
| 39 |
"Default (no voice clone)": None,
|
| 40 |
}
|
| 41 |
|
| 42 |
+
def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
|
| 43 |
"""Convert text → speech using Voxtral Mini TTS."""
|
| 44 |
if not text.strip():
|
| 45 |
return None, "⚠️ Please enter some text."
|
|
|
|
| 54 |
)
|
| 55 |
if voice_id:
|
| 56 |
kwargs["voice_id"] = voice_id
|
| 57 |
+
|
| 58 |
+
# Add Reference Audio for Zero-shot tone/voice cloning
|
| 59 |
+
if ref_audio_path:
|
| 60 |
+
with open(ref_audio_path, "rb") as f:
|
| 61 |
+
import base64
|
| 62 |
+
ref_audio_b64 = base64.b64encode(f.read()).decode("utf-8")
|
| 63 |
+
kwargs["ref_audio"] = ref_audio_b64
|
| 64 |
|
| 65 |
response = client.audio.speech.complete(**kwargs)
|
| 66 |
audio_bytes = base64.b64decode(response.audio_data)
|
|
|
|
| 276 |
value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
|
| 277 |
)
|
| 278 |
tts_voice_id = gr.Textbox(
|
| 279 |
+
label="Saved Voice ID (Optional)",
|
| 280 |
+
placeholder="Paste a Voice ID from the Voice Cloning tab...",
|
| 281 |
+
)
|
| 282 |
+
tts_ref_audio = gr.Audio(
|
| 283 |
+
label="OR: Reference Audio (Set voice tone instantly)",
|
| 284 |
+
sources=["upload", "microphone"],
|
| 285 |
+
type="filepath",
|
| 286 |
)
|
| 287 |
tts_format = gr.Dropdown(
|
| 288 |
choices=["mp3", "wav", "flac", "opus"],
|
|
|
|
| 301 |
|
| 302 |
tts_btn.click(
|
| 303 |
fn=synthesize_speech,
|
| 304 |
+
inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
|
| 305 |
outputs=[tts_audio_out, tts_status],
|
| 306 |
)
|
| 307 |
|