voxtral-studio

Sleeping

App Files Files Community

mehdilaalali committed on Apr 12

Commit

559f05c

verified ·

1 Parent(s): 66e6d77

fix: populate voices via lazy load, add direct mp3 audio curl bypass for YT blocks, add clone badge

Browse files

Files changed (1) hide show

app.py +72 -40

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import os
 import base64
-import tempfile
 import gradio as gr
 from pathlib import Path
 import base64
 import os
 from mistralai.client import Mistral
 def list_user_voices():
@@ -118,34 +118,49 @@ def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
     final_audio_path = audio_path
     try:
-        # If URL is provided, download it with yt-dlp
         if url_input.strip():
-            import yt_dlp
             base_out = tempfile.mktemp()
-            ydl_opts = {
-                'format': 'bestaudio/best',
-                'outtmpl': base_out + '.%(ext)s',
-                'quiet': True,
-                'postprocessors': [{
-                    'key': 'FFmpegExtractAudio',
-                    'preferredcodec': 'mp3',
-                    'preferredquality': '128',
-                }],
-                'postprocessor_args': [
-                    '-t', '60' # Limit to first 60 seconds to avoid exceeding API limits
-                ],
-            }
-            try:
-                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                    info = ydl.extract_info(url_input.strip(), download=True)
-                    # after postprocessing, file has .mp3 extension
-                    final_audio_path = base_out + '.mp3'
-            except Exception as e:
-                err_msg = str(e)
-                if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
-                    raise gr.Error("YouTube blocked the Hugging Face datacenter IP. Please try a TikTok/Twitter link instead, or download the MP3 manually and upload it above.")
-                else:
-                    raise gr.Error(f"Video download failed: {err_msg}")
         client = get_client()
         sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
@@ -201,7 +216,7 @@ body, .gradio-container {
     z-index: 10;
 }
 .app-header h1 {
-    font-size: 3.8rem;
     font-weight: 800;
     letter-spacing: -1.5px;
     background: linear-gradient(135deg, #c084fc 0%, #ec4899 50%, #facc15 100%);
@@ -213,10 +228,21 @@ body, .gradio-container {
 }
 .app-header p {
     color: #94a3b8;
-    font-size: 1.15rem;
-    font-weight: 400;
     margin-top: 0;
 }
 /* Glass panel wrapper */
 div.tabs-container, .panel-box {
@@ -344,15 +370,19 @@ label span {
 footer { display: none !important; }
 """
-# Initialize voices at startup
-INITIAL_VOICES = get_voice_choices()
 with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
     gr.HTML("""
     <div class="app-header">
-        <h1>🎙️ Voxtral Studio</h1>
-        <p>Powered by Mistral AI · Speech-to-Text &amp; Text-to-Speech with Voice Cloning</p>
     </div>
     """)
@@ -408,9 +438,8 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
                     )
                     with gr.Row():
                         tts_voice_id = gr.Dropdown(
-                            label="Select a Mistral Voice",
-                            choices=INITIAL_VOICES,
-                            value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
                             allow_custom_value=True,
                             scale=3,
                         )
@@ -466,8 +495,8 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
                         elem_classes=["audio-component"],
                     )
                     clone_url = gr.Textbox(
-                        label="OR: Media URL (YouTube, TikTok, MP3, etc.)",
-                        placeholder="https://www.youtube.com/watch?v=...",
                     )
                     clone_name = gr.Textbox(
                         label="Voice Name",
@@ -505,5 +534,8 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
     """)
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860, css=css, ssr_mode=False)

 import os
 import base64
 import gradio as gr
 from pathlib import Path
 import base64
 import os
+import requests
 from mistralai.client import Mistral
 def list_user_voices():
     final_audio_path = audio_path
     try:
+        # If URL is provided, handle direct links or yt-dlp
         if url_input.strip():
+            url = url_input.strip()
             base_out = tempfile.mktemp()
+            # If it's a direct audio file link, bypass yt-dlp and download it directly
+            if url.lower().endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a')):
+                try:
+                    ext = url.split('.')[-1]
+                    final_audio_path = f"{base_out}.{ext}"
+                    with requests.get(url, stream=True, timeout=15) as r:
+                        r.raise_for_status()
+                        with open(final_audio_path, 'wb') as f:
+                            for chunk in r.iter_content(chunk_size=8192):
+                                f.write(chunk)
+                except Exception as e:
+                    return f"❌ Error downloading direct audio link: {str(e)}", gr.update()
+            # Otherwise use yt-dlp for TikTok, Twitter, YouTube (if not blocked), etc.
+            else:
+                import yt_dlp
+                ydl_opts = {
+                    'format': 'bestaudio/best',
+                    'outtmpl': base_out + '.%(ext)s',
+                    'quiet': True,
+                    'postprocessors': [{
+                        'key': 'FFmpegExtractAudio',
+                        'preferredcodec': 'mp3',
+                        'preferredquality': '128',
+                    }],
+                    'postprocessor_args': [
+                        '-t', '60' # Limit to first 60 seconds
+                    ],
+                }
+                try:
+                    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                        info = ydl.extract_info(url, download=True)
+                        final_audio_path = base_out + '.mp3'
+                except Exception as e:
+                    err_msg = str(e)
+                    if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
+                        raise gr.Error("YouTube blocked the Hugging Face Server. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.")
+                    else:
+                        raise gr.Error(f"Video download failed: {err_msg}")
         client = get_client()
         sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
     z-index: 10;
 }
 .app-header h1 {
+    font-size: 3.2rem;
     font-weight: 800;
     letter-spacing: -1.5px;
     background: linear-gradient(135deg, #c084fc 0%, #ec4899 50%, #facc15 100%);
 }
 .app-header p {
     color: #94a3b8;
+    font-size: 1.25rem;
+    font-weight: 500;
     margin-top: 0;
 }
+.highlight-badge {
+    background: linear-gradient(135deg, #f59e0b, #ef4444);
+    color: white;
+    padding: 2px 8px;
+    border-radius: 8px;
+    font-size: 0.8rem;
+    font-weight: 800;
+    vertical-align: top;
+    margin-left: 10px;
+    box-shadow: 0 0 10px rgba(239, 68, 68, 0.6);
+}
 /* Glass panel wrapper */
 div.tabs-container, .panel-box {
 footer { display: none !important; }
 """
+# Helper to initialize voices on ui load
+def init_voices_ui():
+    choices = get_voice_choices()
+    default_val = choices[0][1] if choices else None
+    return gr.update(choices=choices, value=default_val)
 with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
     gr.HTML("""
     <div class="app-header">
+        <h1>🎙️ Voxtral Studio <span class="highlight-badge">VOICE CLONING</span></h1>
+        <p>Powered by Mistral AI · STT & Elite Text-to-Speech + Instant Zero-Shot Cloning</p>
     </div>
     """)
                     )
                     with gr.Row():
                         tts_voice_id = gr.Dropdown(
+                            label="Select a Mistral Voice or Your Clones",
+                            choices=[], # Populated on load
                             allow_custom_value=True,
                             scale=3,
                         )
                         elem_classes=["audio-component"],
                     )
                     clone_url = gr.Textbox(
+                        label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
+                        placeholder="https://...link_to_audio_or_video...",
                     )
                     clone_name = gr.Textbox(
                         label="Voice Name",
     """)
+    # Populate choices dynamically when the page loads for each user!
+    demo.load(fn=init_voices_ui, outputs=tts_voice_id)
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)