voxtral-studio

Sleeping

App Files Files Community

mehdilaalali committed on Apr 12

Commit

242da97

verified ·

1 Parent(s): cef5f3b

refactor: decouple API functions to core.py

Browse files

Files changed (1) hide show

app.py +26 -164

app.py CHANGED Viewed

@@ -2,180 +2,39 @@ import os
 import base64
 import tempfile
 import gradio as gr
 from pathlib import Path
-import requests
-from mistralai.client import Mistral
-def list_user_voices():
-    # Keep the textual list for debugging if needed, or remove it. Let's keep it but improve it.
-    try:
-        client = get_client()
-        result = client.audio.voices.list(limit=100, offset=0)
-        if result.total == 0:
-            return "No voices found in your account."
-        out = f"**Total Voices:** {result.total}\n\n"
-        for voice in result.items:
-            out += f"- **{voice.name}**\n  - ID: `{voice.id}`\n  - Languages: {', '.join(voice.languages) if hasattr(voice, 'languages') else 'unknown'}\n"
-        return out
-    except Exception as e:
-        return f"Error fetching voices: {str(e)}"
-def get_voice_choices():
-    try:
-        client = get_client()
-        res = client.audio.voices.list(limit=100, offset=0)
-        # Filter for Official Mistral Voices (Paul, Oliver, Jane, Marie) so we hide randomly cloned user voices
-        official_names = ("Paul", "Oliver", "Jane", "Marie")
-        official = []
-        for v in res.items:
-            if v.name.startswith(official_names) and " - " in v.name:
-                official.append((f"{v.name}", v.id))
-        return official
-    except:
-        return []
-# ─── Client ───────────────────────────────────────────────────────────────────
-def get_client():
-    api_key = os.environ.get("MISTRAL_API_KEY")
-    if not api_key:
-        raise gr.Error("MISTRAL_API_KEY secret is not set. Add it in Space Settings → Secrets.")
-    return Mistral(api_key=api_key)
-# ─── STT ──────────────────────────────────────────────────────────────────────
-def transcribe_audio(audio_path, language):
-    """Convert audio file → text using Voxtral Mini Transcribe."""
-    if audio_path is None:
         return "⚠️ Please record or upload an audio file first."
     try:
-        client = get_client()
-        lang_param = language if language != "Auto-detect" else None
-        with open(audio_path, "rb") as f:
-            kwargs = dict(
-                model="voxtral-mini-latest",
-                file={"content": f, "file_name": Path(audio_path).name},
-            )
-            if lang_param:
-                kwargs["language"] = lang_param
-            response = client.audio.transcriptions.complete(**kwargs)
-        return response.text
     except Exception as e:
         return f"❌ Error: {str(e)}"
-# ─── TTS ──────────────────────────────────────────────────────────────────────
-BUILTIN_VOICES = {
-    "Default (no voice clone)": None,
-}
-def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
-    """Convert text → speech using Voxtral Mini TTS."""
-    if not text.strip():
-        return None, "⚠️ Please enter some text."
     try:
-        client = get_client()
         voice_id = voice_id_input.strip() if voice_id_input and voice_id_input.strip() else None
-        kwargs = dict(
-            model="voxtral-mini-tts-2603",
-            input=text,
-            response_format=audio_format,
-        )
-        if voice_id:
-            kwargs["voice_id"] = voice_id
-        # Add Reference Audio for Zero-shot tone/voice cloning
-        if ref_audio_path:
-            with open(ref_audio_path, "rb") as f:
-                ref_audio_b64 = base64.b64encode(f.read()).decode("utf-8")
-            kwargs["ref_audio"] = ref_audio_b64
-        if not voice_id and not ref_audio_path:
-            raise gr.Error("Mistral API requires a voice! Please either upload a short 'Reference Audio' clip to define the voice tone zero-shot, OR paste a valid Voice ID you cloned in the Voice Cloning tab. There are no built-in standard voices.")
-        response = client.audio.speech.complete(**kwargs)
-        audio_bytes = base64.b64decode(response.audio_data)
-        # Write to temp file
-        suffix = f".{audio_format}"
-        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
-        tmp.write(audio_bytes)
-        tmp.close()
-        return tmp.name, f"✅ Generated {len(audio_bytes):,} bytes of {audio_format.upper()} audio."
     except Exception as e:
         return None, f"❌ Error: {str(e)}"
-# ─── Voice Cloning ────────────────────────────────────────────────────────────
-def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
-    """Upload a sample audio or provide a URL to create a reusable cloned voice."""
-    if not audio_path and not url_input.strip():
-        return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
-    if not voice_name.strip():
-        return "⚠️ Please enter a name for the voice.", gr.update()
-    final_audio_path = audio_path
     try:
-        # If URL is provided, handle direct links or yt-dlp
-        if url_input.strip():
-            url = url_input.strip()
-            base_out = tempfile.mktemp()
-            # If it's a direct audio file link, bypass yt-dlp and download it directly
-            if url.lower().endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a')):
-                try:
-                    ext = url.split('.')[-1]
-                    final_audio_path = f"{base_out}.{ext}"
-                    with requests.get(url, stream=True, timeout=15) as r:
-                        r.raise_for_status()
-                        with open(final_audio_path, 'wb') as f:
-                            for chunk in r.iter_content(chunk_size=8192):
-                                f.write(chunk)
-                except Exception as e:
-                    return f"❌ Error downloading direct audio link: {str(e)}", gr.update()
-            # Otherwise use yt-dlp for TikTok, Twitter, YouTube (if not blocked), etc.
-            else:
-                import yt_dlp
-                ydl_opts = {
-                    'format': 'bestaudio/best',
-                    'outtmpl': base_out + '.%(ext)s',
-                    'quiet': True,
-                    'postprocessors': [{
-                        'key': 'FFmpegExtractAudio',
-                        'preferredcodec': 'mp3',
-                        'preferredquality': '128',
-                    }],
-                    'postprocessor_args': [
-                        '-t', '60' # Limit to first 60 seconds
-                    ],
-                }
-                try:
-                    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                        info = ydl.extract_info(url, download=True)
-                        final_audio_path = base_out + '.mp3'
-                except Exception as e:
-                    err_msg = str(e)
-                    if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
-                        raise gr.Error("YouTube blocked the Hugging Face Server. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.")
-                    else:
-                        raise gr.Error(f"Video download failed: {err_msg}")
-        client = get_client()
-        sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
-        langs = [l.strip() for l in languages_str.split(",") if l.strip()] or ["en"]
-        voice = client.audio.voices.create(
-            name=voice_name.strip(),
-            sample_audio=sample_b64,
-            sample_filename=Path(final_audio_path).name,
-            languages=langs,
-            gender=gender.lower(),
-        )
-        # Clean up downloaded file
-        if url_input.strip() and os.path.exists(final_audio_path):
-            try: os.remove(final_audio_path)
-            except: pass
         # Build new choices specifically for this user session: Official Voices + Their new clone
         new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
         return (
@@ -183,7 +42,10 @@ def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
             gr.update(choices=new_session_choices, value=voice.id)
         )
     except Exception as e:
-        return f"❌ Error: {str(e)}", gr.update()
 # ─── UI ───────────────────────────────────────────────────────────────────────
@@ -399,7 +261,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:
                     )
             stt_btn.click(
-                fn=transcribe_audio,
                 inputs=[stt_audio, stt_language],
                 outputs=stt_output,
             )
@@ -451,7 +313,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:
                     tts_status = gr.Markdown(elem_classes=["status-text"])
             tts_btn.click(
-                fn=synthesize_speech,
                 inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                 outputs=[tts_audio_out, tts_status],
             )
@@ -504,7 +366,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:
                     )
             clone_btn.click(
-                fn=clone_voice,
                 inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                 outputs=[clone_result, tts_voice_id],
             )

 import base64
 import tempfile
 import gradio as gr
+import base64
+import os
+import tempfile
+import gradio as gr
 from pathlib import Path
+from core import (
+    get_voice_choices,
+    transcribe_audio as core_transcribe,
+    synthesize_speech as core_synthesize,
+    clone_voice as core_clone
+)
+# ─── Gradio App Wrappers ──────────────────────────────────────────────────────
+def transcribe_handler(audio_path, language):
+    if not audio_path:
         return "⚠️ Please record or upload an audio file first."
     try:
+        return core_transcribe(audio_path, language)
     except Exception as e:
         return f"❌ Error: {str(e)}"
+def synthesize_handler(text, voice_id_input, ref_audio_path, audio_format):
     try:
         voice_id = voice_id_input.strip() if voice_id_input and voice_id_input.strip() else None
+        output_path, num_bytes = core_synthesize(text, voice_id, ref_audio_path, audio_format)
+        return output_path, f"✅ Generated {num_bytes:,} bytes of {audio_format.upper()} audio."
     except Exception as e:
         return None, f"❌ Error: {str(e)}"
+def clone_handler(audio_path, url_input, voice_name, gender, languages_str):
     try:
+        voice = core_clone(audio_path, url_input, voice_name, gender, languages_str)
         # Build new choices specifically for this user session: Official Voices + Their new clone
         new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
         return (
             gr.update(choices=new_session_choices, value=voice.id)
         )
     except Exception as e:
+        err_msg = str(e)
+        if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
+            return "❌ YouTube blocked the proxy crawler. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.", gr.update()
+        return f"❌ Error: {err_msg}", gr.update()
 # ─── UI ───────────────────────────────────────────────────────────────────────
                     )
             stt_btn.click(
+                fn=transcribe_handler,
                 inputs=[stt_audio, stt_language],
                 outputs=stt_output,
             )
                     tts_status = gr.Markdown(elem_classes=["status-text"])
             tts_btn.click(
+                fn=synthesize_handler,
                 inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                 outputs=[tts_audio_out, tts_status],
             )
                     )
             clone_btn.click(
+                fn=clone_handler,
                 inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                 outputs=[clone_result, tts_voice_id],
             )