"""Gradio app exposing Mistral Voxtral speech-to-text, text-to-speech and voice cloning."""

import base64
import logging
import os
import shutil
import tempfile
from pathlib import Path
from urllib.parse import urlparse

import gradio as gr
import requests
from mistralai.client import Mistral

logger = logging.getLogger(__name__)

# Name prefixes used to pick the official Mistral voices out of the account listing.
_OFFICIAL_VOICE_PREFIXES = ("Paul", "Oliver", "Jane", "Marie")

# URL path suffixes that are downloaded directly instead of going through yt-dlp.
_DIRECT_AUDIO_SUFFIXES = (".mp3", ".wav", ".flac", ".ogg", ".m4a")


def list_user_voices():
    """Return a Markdown summary of the voices in the account (debug helper).

    Returns:
        A Markdown string listing each voice's name, ID and languages, a
        "no voices" notice when the account is empty, or an error message if
        the listing call fails.
    """
    try:
        client = get_client()
        result = client.audio.voices.list(limit=100, offset=0)
        if result.total == 0:
            return "No voices found in your account."

        parts = [f"**Total Voices:** {result.total}\n\n"]
        for voice in result.items:
            # ``languages`` may be missing or empty on a voice; show a placeholder then.
            languages = getattr(voice, "languages", None)
            languages_text = ", ".join(languages) if languages else "unknown"
            parts.append(
                f"- **{voice.name}**\n"
                f"  - ID: `{voice.id}`\n"
                f"  - Languages: {languages_text}\n"
            )
        return "".join(parts)
    except Exception as e:
        return f"Error fetching voices: {str(e)}"


def get_voice_choices():
    """Return ``(label, voice_id)`` dropdown choices for the official Mistral voices.

    Only voices whose name starts with one of the official prefixes and
    contains ``" - "`` are kept, which hides user-cloned voices.

    Returns:
        A list of ``(name, id)`` tuples; an empty list if the voices cannot
        be fetched (best-effort, so the UI still builds without a key).
    """
    try:
        client = get_client()
        res = client.audio.voices.list(limit=100, offset=0)
        return [
            (f"{v.name}", v.id)
            for v in res.items
            if v.name.startswith(_OFFICIAL_VOICE_PREFIXES) and " - " in v.name
        ]
    except Exception as e:
        # Deliberately best-effort, but never swallow KeyboardInterrupt/SystemExit.
        logger.warning("Could not load voice choices: %s", e)
        return []


# ─── Client ───────────────────────────────────────────────────────────────────

def get_client():
    """Build a Mistral client from the ``MISTRAL_API_KEY`` environment variable.

    Raises:
        gr.Error: If ``MISTRAL_API_KEY`` is unset or empty.
    """
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise gr.Error("MISTRAL_API_KEY secret is not set. Add it in Space Settings → Secrets.")
    return Mistral(api_key=api_key)


# ─── STT ──────────────────────────────────────────────────────────────────────

def transcribe_audio(audio_path, language):
    """Convert audio file → text using Voxtral Mini Transcribe.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or ``None``.
        language: Language code, or ``"Auto-detect"`` to omit the language hint.

    Returns:
        The transcribed text, or a user-facing warning/error message.
    """
    if audio_path is None:
        return "⚠️ Please record or upload an audio file first."
    try:
        client = get_client()
        lang_param = language if language != "Auto-detect" else None
        with open(audio_path, "rb") as f:
            kwargs = dict(
                model="voxtral-mini-latest",
                file={"content": f, "file_name": Path(audio_path).name},
            )
            if lang_param:
                kwargs["language"] = lang_param
            # The request must be sent while the file handle is still open.
            response = client.audio.transcriptions.complete(**kwargs)
        return response.text
    except Exception as e:
        return f"❌ Error: {str(e)}"


# ─── TTS ──────────────────────────────────────────────────────────────────────

BUILTIN_VOICES = {
    "Default (no voice clone)": None,
}


def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
    """Convert text → speech using Voxtral Mini TTS.

    Args:
        text: Text to speak.
        voice_id_input: Voice ID selected or typed in the dropdown (may be empty).
        ref_audio_path: Optional path to a reference clip for zero-shot cloning.
        audio_format: Output format; used as the response format and file suffix.

    Returns:
        ``(audio_file_path, status_message)``; the path is ``None`` on failure.
    """
    if not text or not text.strip():
        return None, "⚠️ Please enter some text."

    voice_id = (voice_id_input or "").strip() or None

    # Validate before doing any file or network work: one of the two voice
    # sources is mandatory.
    if not voice_id and not ref_audio_path:
        return None, (
            "❌ Error: Mistral API requires a voice! Please either upload a short "
            "'Reference Audio' clip to define the voice tone zero-shot, OR paste a "
            "valid Voice ID you cloned in the Voice Cloning tab. There are no "
            "built-in standard voices."
        )

    try:
        client = get_client()

        kwargs = dict(
            model="voxtral-mini-tts-2603",
            input=text,
            response_format=audio_format,
        )
        if voice_id:
            kwargs["voice_id"] = voice_id

        # Add Reference Audio for Zero-shot tone/voice cloning
        if ref_audio_path:
            ref_bytes = Path(ref_audio_path).read_bytes()
            kwargs["ref_audio"] = base64.b64encode(ref_bytes).decode("utf-8")

        response = client.audio.speech.complete(**kwargs)
        audio_bytes = base64.b64decode(response.audio_data)

        # delete=False: the file has to outlive this function so Gradio can serve it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{audio_format}") as tmp:
            tmp.write(audio_bytes)

        return tmp.name, f"✅ Generated {len(audio_bytes):,} bytes of {audio_format.upper()} audio."
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# ─── Voice Cloning ────────────────────────────────────────────────────────────

class _SampleDownloadError(Exception):
    """Raised when a remote voice sample cannot be fetched; carries the user-facing message."""


def _download_direct_audio(url: str, dest_path: str) -> None:
    """Stream a direct audio link to ``dest_path``."""
    try:
        with requests.get(url, stream=True, timeout=15) as r:
            r.raise_for_status()
            with open(dest_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        raise _SampleDownloadError(
            f"❌ Error downloading direct audio link: {str(e)}"
        ) from e


def _download_with_ytdlp(url: str, base_out: str) -> str:
    """Extract the audio track of a media page as MP3 with yt-dlp and return its path."""
    # Imported lazily so the rest of the app works without yt-dlp installed.
    import yt_dlp

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': base_out + '.%(ext)s',
        'quiet': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '128',
        }],
        'postprocessor_args': [
            '-t', '60'  # Limit to first 60 seconds
        ],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url, download=True)
    except Exception as e:
        err_msg = str(e)
        if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
            raise _SampleDownloadError(
                "❌ Error: YouTube blocked the Hugging Face Server. Please use a "
                "TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually."
            ) from e
        raise _SampleDownloadError(f"❌ Error: Video download failed: {err_msg}") from e
    # The FFmpegExtractAudio post-processor is configured to produce MP3.
    return base_out + '.mp3'


def _fetch_remote_sample(url: str, workdir: str) -> str:
    """Download the sample behind ``url`` into ``workdir`` and return the file path."""
    # Look at the URL *path* so links with query strings or upper-case
    # extensions are still recognised as direct audio files.
    suffix = Path(urlparse(url).path).suffix.lower()
    base_out = os.path.join(workdir, "sample")
    if suffix in _DIRECT_AUDIO_SUFFIXES:
        dest_path = base_out + suffix
        _download_direct_audio(url, dest_path)
        return dest_path
    # Otherwise use yt-dlp for TikTok, Twitter, YouTube (if not blocked), etc.
    return _download_with_ytdlp(url, base_out)


def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
    """Upload a sample audio or provide a URL to create a reusable cloned voice.

    When a URL is given it takes precedence over the uploaded clip.

    Args:
        audio_path: Path of the uploaded/recorded sample, or ``None``.
        url_input: Media page URL or direct audio link (may be empty).
        voice_name: Name for the new voice.
        gender: Gender label; sent lower-cased to the API.
        languages_str: Comma-separated language codes; defaults to ``["en"]``.

    Returns:
        ``(status_markdown, dropdown_update)``. On success the update carries
        the official voices plus the new clone, with the clone selected; on
        failure it is an empty ``gr.update()``.
    """
    url = (url_input or "").strip()
    name = (voice_name or "").strip()

    if not audio_path and not url:
        return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
    if not name:
        return "⚠️ Please enter a name for the voice.", gr.update()

    workdir = None
    try:
        final_audio_path = audio_path
        if url:
            # A private directory avoids the name race of tempfile.mktemp() and
            # makes cleanup a single rmtree.
            workdir = tempfile.mkdtemp(prefix="voice-clone-")
            final_audio_path = _fetch_remote_sample(url, workdir)

        client = get_client()
        sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()

        langs = [code.strip() for code in (languages_str or "").split(",") if code.strip()] or ["en"]

        voice = client.audio.voices.create(
            name=name,
            sample_audio=sample_b64,
            sample_filename=Path(final_audio_path).name,
            languages=langs,
            gender=gender.lower(),
        )

        # Build new choices specifically for this user session: Official Voices + Their new clone
        new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]

        return (
            f"✅ Voice created!\n\n**Voice ID:** `{voice.id}`\n**Name:** {voice.name}\n**Languages:** {', '.join(voice.languages)}\n\nThis voice has been automatically selected in the Text-to-Speech tab!",
            gr.update(choices=new_session_choices, value=voice.id),
        )
    except _SampleDownloadError as e:
        return str(e), gr.update()
    except Exception as e:
        return f"❌ Error: {str(e)}", gr.update()
    finally:
        # Remove the downloaded sample on every path, including failures.
        if workdir is not None:
            shutil.rmtree(workdir, ignore_errors=True)


# ─── UI ───────────────────────────────────────────────────────────────────────

LANGUAGES = [
    "Auto-detect", "en", "fr", "es", "de", "it", "pt",
    "zh", "ja", "ko", "ar", "ru", "hi", "nl"
]

css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&display=swap');

* { font-family: 'Outfit', sans-serif; }

body, .gradio-container {
    background: radial-gradient(circle at 10% 20%, #150f24 0%, #07040d 100%) !important;
    min-height: 100vh;
}
.gradio-container { max-width: 1050px !important; margin: 0 auto !important; }

/* App Header */
.app-header {
    text-align: center;
    padding: 3.5rem 1rem 1.5rem;
    position: relative;
    z-index: 10;
}
.app-header h1 {
    font-size: 3.2rem;
    font-weight: 800;
    letter-spacing: -1.5px;
    background: linear-gradient(135deg, #c084fc 0%, #ec4899 50%, #facc15 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin-bottom: 0.5rem;
    animation: glow-pulse 3s infinite alternate;
}
.app-header p {
    color: #94a3b8;
    font-size: 1.25rem;
    font-weight: 500;
    margin-top: 0;
}
.highlight-badge {
    background: linear-gradient(135deg, #f59e0b, #ef4444);
    color: white;
    padding: 2px 8px;
    border-radius: 8px;
    font-size: 0.8rem;
    font-weight: 800;
    vertical-align: top;
    margin-left: 10px;
    box-shadow: 0 0 10px rgba(239, 68, 68, 0.6);
}

/* Glass panel wrapper */
div.tabs-container, .panel-box {
    background: rgba(255, 255, 255, 0.02) !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    border-radius: 20px !important;
    box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.4) !important;
    backdrop-filter: blur(15px) !important;
    -webkit-backdrop-filter: blur(15px) !important;
    overflow: hidden;
}

/* Tabs */
.tab-nav {
    border-bottom: 1px solid rgba(255,255,255,0.05) !important;
    padding: 10px 10px 0 10px !important;
}
.tab-nav button {
    background: transparent !important;
    border: none !important;
    border-bottom: 3px solid transparent !important;
    color: #64748b !important;
    border-radius: 0 !important;
    margin: 0 !important;
    padding: 1rem 2rem !important;
    font-weight: 600 !important;
    font-size: 1.05rem !important;
    transition: all 0.3s ease !important;
    box-shadow: none !important;
}
.tab-nav button.selected, .tab-nav button:hover {
    color: #f8fafc !important;
    border-bottom: 3px solid #ec4899 !important;
    box-shadow: 0 20px 20px -20px rgba(236,72,153,0.3) !important;
    background: linear-gradient(0deg, rgba(236,72,153,0.1) 0%, transparent 100%) !important;
}

/* Inputs & Textareas */
textarea, input[type="text"], .dropdown-menu {
    background: rgba(0,0,0,0.25) !important;
    border: 1px solid rgba(255,255,255,0.08) !important;
    border-radius: 14px !important;
    color: #f8fafc !important;
    font-size: 1.05rem !important;
    transition: all 0.2s ease !important;
    padding: 0.75rem !important;
}
textarea:focus, input[type="text"]:focus {
    border-color: #ec4899 !important;
    box-shadow: 0 0 0 3px rgba(236,72,153,0.2) !important;
    background: rgba(0,0,0,0.4) !important;
}

/* Override Gradio layout borders */
div.form {
    border: none !important;
    box-shadow: none !important;
    background: transparent !important;
}

/* Cool gradient buttons */
button.primary {
    background: linear-gradient(135deg, #a78bfa 0%, #ec4899 100%) !important;
    border: none !important;
    color: white !important;
    border-radius: 14px !important;
    font-weight: 700 !important;
    font-size: 1.15rem !important;
    padding: 0.9rem !important;
    letter-spacing: 0.5px !important;
    box-shadow: 0 4px 15px rgba(236,72,153,0.3) !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button.primary:hover {
    transform: translateY(-3px) !important;
    box-shadow: 0 8px 25px rgba(236,72,153,0.5) !important;
}

/* Secondary Button */
button.secondary {
    background: rgba(255,255,255,0.05) !important;
    border: 1px solid rgba(255,255,255,0.1) !important;
    border-radius: 14px !important;
    color: #e2e8f0 !important;
    transition: all 0.2s ease !important;
    font-weight: 600 !important;
}
button.secondary:hover {
    background: rgba(255,255,255,0.15) !important;
    border-color: rgba(255,255,255,0.3) !important;
}

/* Status text box */
.status-text {
    background: rgba(0,0,0,0.4);
    padding: 1.5rem;
    border-radius: 16px;
    border-left: 5px solid #a78bfa;
    color: #e2e8f0;
    font-size: 1rem;
    line-height: 1.6;
}

/* Highlight labels */
label span {
    color: #cbd5e1 !important;
    font-weight: 500 !important;
    letter-spacing: 0.2px !important;
}

/* Clean audio components */
.audio-component {
    border-radius: 16px !important;
    overflow: hidden !important;
    border: 1px solid rgba(255,255,255,0.05) !important;
}

/* Global Animations */
@keyframes glow-pulse {
    0% { filter: drop-shadow(0 0 15px rgba(167,139,250,0.3)); }
    100% { filter: drop-shadow(0 0 30px rgba(236,72,153,0.6)); }
}

/* Footer Hide */
footer { display: none !important; }
"""

# Fetched once at import time to seed the TTS voice dropdown; empty on failure.
INITIAL_VOICES = get_voice_choices()


def _refresh_voice_dropdown():
    """Re-query the official voices and push them into the TTS dropdown."""
    return gr.update(choices=get_voice_choices())


with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:
    # Header markup uses the .app-header / .highlight-badge classes from ``css``.
    gr.HTML("""
<div class="app-header">
    <h1>🎙️ Voxtral Studio <span class="highlight-badge">VOICE CLONING</span></h1>
    <p>Powered by Mistral AI · STT & Elite Text-to-Speech + Instant Zero-Shot Cloning</p>
</div>
""")

    with gr.Tabs():
        # ── TAB 1: Speech to Text ──────────────────────────────────────────
        with gr.TabItem("🎤 Speech → Text"):
            gr.Markdown(
                "**Upload or record audio** and Voxtral Mini will transcribe it with high accuracy.\n"
                "Supports 13 languages, handles noise, and can detect the language automatically."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    stt_audio = gr.Audio(
                        label="Audio Input",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    stt_language = gr.Dropdown(
                        choices=LANGUAGES,
                        value="Auto-detect",
                        label="Language",
                    )
                    stt_btn = gr.Button("✨ Transcribe", variant="primary")
                with gr.Column(scale=1):
                    stt_output = gr.Textbox(
                        label="Transcription",
                        lines=12,
                        placeholder="Your transcribed text will appear here...",
                    )
            stt_btn.click(
                fn=transcribe_audio,
                inputs=[stt_audio, stt_language],
                outputs=stt_output,
            )

        # ── TAB 2: Text to Speech ──────────────────────────────────────────
        with gr.TabItem("🔊 Text → Speech", elem_classes=["tabs-container"]):
            gr.Markdown(
                "**Type text** and Voxtral Mini TTS converts it to natural speech.\n"
                "Optionally paste a **Voice ID** from the Voice Cloning tab to use your own cloned voice."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    tts_text = gr.Textbox(
                        label="Text to speak",
                        lines=8,
                        placeholder="Enter text here (max ~300 words for best results). Avoid markdown or special characters.",
                        value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
                    )
                    with gr.Row():
                        tts_voice_id = gr.Dropdown(
                            label="Select a Mistral Voice or Your Clones",
                            choices=INITIAL_VOICES,
                            value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
                            allow_custom_value=True,
                            scale=3,
                        )
                        voices_btn = gr.Button("🔄 Refresh List", size="sm", scale=1)
                    voices_list_out = gr.Markdown(visible=False)  # Hide text list since we use dropdown now
                    tts_ref_audio = gr.Audio(
                        label="OR: Reference Audio (Set voice tone instantly)",
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                    tts_format = gr.Dropdown(
                        choices=["mp3", "wav", "flac", "opus"],
                        value="mp3",
                        label="Audio Format",
                    )
                    tts_btn = gr.Button("🎵 Generate Speech", variant="primary")
                with gr.Column(scale=1):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    tts_status = gr.Markdown(elem_classes=["status-text"])
            tts_btn.click(
                fn=synthesize_speech,
                inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                outputs=[tts_audio_out, tts_status],
            )
            voices_btn.click(
                fn=_refresh_voice_dropdown,
                inputs=[],
                outputs=tts_voice_id,
            )

        # ── TAB 3: Voice Cloning ───────────────────────────────────────────
        with gr.TabItem("🧬 Voice Cloning", elem_classes=["tabs-container"]):
            gr.Markdown(
                "**Clone any voice** by uploading a short audio sample (10–60 seconds recommended).\n"
                "The model will save it as a reusable voice. Copy the Voice ID and paste it in the TTS tab.\n"
                "\n"
                "> ⚠️ Only clone voices with **explicit consent**. Do not impersonate real people."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    clone_audio = gr.Audio(
                        label="Voice Sample (upload or record)",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    clone_url = gr.Textbox(
                        label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
                        placeholder="https://...link_to_audio_or_video...",
                    )
                    clone_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g. my-assistant-voice",
                    )
                    clone_gender = gr.Dropdown(
                        choices=["Female", "Male"],
                        value="Female",
                        label="Gender",
                    )
                    clone_langs = gr.Textbox(
                        label="Languages (comma-separated)",
                        value="en",
                        placeholder="en, fr, es",
                    )
                    clone_btn = gr.Button("🧬 Clone Voice", variant="primary")
                with gr.Column(scale=1):
                    clone_result = gr.Markdown(
                        value="Your new Voice ID will appear here after cloning.",
                        elem_classes=["status-text"],
                    )
            # The second output updates the TTS dropdown so the new clone is preselected.
            clone_btn.click(
                fn=clone_voice,
                inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                outputs=[clone_result, tts_voice_id],
            )

    gr.HTML("""
Built with Mistral Voxtral · Hugging Face Spaces
""")


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)