voxtral-studio

Sleeping

App Files Community

mehdilaalali committed on Apr 12

Commit

3cda682

verified ·

1 Parent(s): f83030e

feat: add ability to clone voice natively from URLs (YouTube, TikTok) using yt-dlp

Browse files

Files changed (1) hide show

app.py +43 -8

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import tempfile
 import gradio as gr
 from pathlib import Path
 import base64
 from mistralai.client import Mistral
 def list_user_voices():
@@ -107,23 +108,53 @@ def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
 # ─── Voice Cloning ────────────────────────────────────────────────────────────
-def clone_voice(audio_path, voice_name, gender, languages_str):
-    """Upload a sample audio to create a reusable cloned voice."""
-    if audio_path is None:
-        return "⚠️ Please upload a sample audio clip."
     if not voice_name.strip():
-        return "⚠️ Please enter a name for the voice."
     try:
         client = get_client()
-        sample_b64 = base64.b64encode(Path(audio_path).read_bytes()).decode()
         langs = [l.strip() for l in languages_str.split(",") if l.strip()] or ["en"]
         voice = client.audio.voices.create(
             name=voice_name.strip(),
             sample_audio=sample_b64,
-            sample_filename=Path(audio_path).name,
             languages=langs,
             gender=gender.lower(),
         )
         # Build new choices specifically for this user session: Official Voices + Their new clone
         new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
         return (
@@ -373,6 +404,10 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
                         type="filepath",
                         elem_classes=["audio-component"],
                     )
                     clone_name = gr.Textbox(
                         label="Voice Name",
                         placeholder="e.g. my-assistant-voice",
@@ -397,7 +432,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
             clone_btn.click(
                 fn=clone_voice,
-                inputs=[clone_audio, clone_name, clone_gender, clone_langs],
                 outputs=[clone_result, tts_voice_id],
             )

 import gradio as gr
 from pathlib import Path
 import base64
+import os
 from mistralai.client import Mistral
 def list_user_voices():
 # ─── Voice Cloning ────────────────────────────────────────────────────────────
+def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
+    """Upload a sample audio or provide a URL to create a reusable cloned voice."""
+    if not audio_path and not url_input.strip():
+        return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
     if not voice_name.strip():
+        return "⚠️ Please enter a name for the voice.", gr.update()
+    final_audio_path = audio_path
     try:
+        # If URL is provided, download it with yt-dlp
+        if url_input.strip():
+            import yt_dlp
+            base_out = tempfile.mktemp()
+            ydl_opts = {
+                'format': 'bestaudio/best',
+                'outtmpl': base_out + '.%(ext)s',
+                'quiet': True,
+                'postprocessors': [{
+                    'key': 'FFmpegExtractAudio',
+                    'preferredcodec': 'mp3',
+                    'preferredquality': '128',
+                }],
+                'postprocessor_args': [
+                    '-t', '60' # Limit to first 60 seconds to avoid exceeding API limits
+                ],
+            }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url_input.strip(), download=True)
+                # after postprocessing, file has .mp3 extension
+                final_audio_path = base_out + '.mp3'
         client = get_client()
+        sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
         langs = [l.strip() for l in languages_str.split(",") if l.strip()] or ["en"]
         voice = client.audio.voices.create(
             name=voice_name.strip(),
             sample_audio=sample_b64,
+            sample_filename=Path(final_audio_path).name,
             languages=langs,
             gender=gender.lower(),
         )
+        # Clean up downloaded file
+        if url_input.strip() and os.path.exists(final_audio_path):
+            try: os.remove(final_audio_path)
+            except: pass
         # Build new choices specifically for this user session: Official Voices + Their new clone
         new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
         return (
                         type="filepath",
                         elem_classes=["audio-component"],
                     )
+                    clone_url = gr.Textbox(
+                        label="OR: Media URL (YouTube, TikTok, MP3, etc.)",
+                        placeholder="https://www.youtube.com/watch?v=...",
+                    )
                     clone_name = gr.Textbox(
                         label="Voice Name",
                         placeholder="e.g. my-assistant-voice",
             clone_btn.click(
                 fn=clone_voice,
+                inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                 outputs=[clone_result, tts_voice_id],
             )