voxtral-studio

Sleeping

App Files Community

mehdilaalali committed on Apr 12

Commit

a478940

verified ·

1 Parent(s): effb38e

feat: add zero-shot ref_audio directly to TTS tab

Browse files

Files changed (1) hide show

app.py +16 -4

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ BUILTIN_VOICES = {
     "Default (no voice clone)": None,
 }
-def synthesize_speech(text, voice_id_input, audio_format):
     """Convert text → speech using Voxtral Mini TTS."""
     if not text.strip():
         return None, "⚠️ Please enter some text."
@@ -54,6 +54,13 @@ def synthesize_speech(text, voice_id_input, audio_format):
         )
         if voice_id:
             kwargs["voice_id"] = voice_id
         response = client.audio.speech.complete(**kwargs)
         audio_bytes = base64.b64decode(response.audio_data)
@@ -269,8 +276,13 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
                         value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
                     )
                     tts_voice_id = gr.Textbox(
-                        label="Voice ID (optional)",
-                        placeholder="Paste a voice ID from the Voice Cloning tab, or leave blank for default voice",
                     )
                     tts_format = gr.Dropdown(
                         choices=["mp3", "wav", "flac", "opus"],
@@ -289,7 +301,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
             tts_btn.click(
                 fn=synthesize_speech,
-                inputs=[tts_text, tts_voice_id, tts_format],
                 outputs=[tts_audio_out, tts_status],
             )

     "Default (no voice clone)": None,
 }
+def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
     """Convert text → speech using Voxtral Mini TTS."""
     if not text.strip():
         return None, "⚠️ Please enter some text."
         )
         if voice_id:
             kwargs["voice_id"] = voice_id
+        # Add Reference Audio for Zero-shot tone/voice cloning
+        if ref_audio_path:
+            with open(ref_audio_path, "rb") as f:
+                import base64
+                ref_audio_b64 = base64.b64encode(f.read()).decode("utf-8")
+            kwargs["ref_audio"] = ref_audio_b64
         response = client.audio.speech.complete(**kwargs)
         audio_bytes = base64.b64decode(response.audio_data)
                         value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
                     )
                     tts_voice_id = gr.Textbox(
+                        label="Saved Voice ID (Optional)",
+                        placeholder="Paste a Voice ID from the Voice Cloning tab...",
+                    )
+                    tts_ref_audio = gr.Audio(
+                        label="OR: Reference Audio (Set voice tone instantly)",
+                        sources=["upload", "microphone"],
+                        type="filepath",
                     )
                     tts_format = gr.Dropdown(
                         choices=["mp3", "wav", "flac", "opus"],
             tts_btn.click(
                 fn=synthesize_speech,
+                inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                 outputs=[tts_audio_out, tts_status],
             )