# voxtral-studio / app.py
# Uploaded by: mehdilaalali
# Last commit: fix: remove demo.load to prevent Gradio 6 tab component race condition freezing UI
# Revision: 405e984 (verified)
# File viewer links: raw, history, blame
# Size: 21.7 kB
import os
import base64
import tempfile
import gradio as gr
from pathlib import Path
import requests
from mistralai.client import Mistral
def list_user_voices():
    """Return a Markdown summary of the voices stored in the account.

    Kept as a textual listing for debugging. Returns a short notice when the
    account reports zero voices, and an ``Error fetching voices: ...`` string
    when the lookup fails for any reason.
    """
    try:
        listing = get_client().audio.voices.list(limit=100, offset=0)
        if listing.total == 0:
            return "No voices found in your account."

        sections = [f"**Total Voices:** {listing.total}\n\n"]
        for entry in listing.items:
            heading = f"- **{entry.name}**\n - ID: `{entry.id}`\n"
            # Some voice objects may not expose a language list at all.
            if hasattr(entry, "languages"):
                language_text = ", ".join(entry.languages)
            else:
                language_text = "unknown"
            sections.append(f"{heading} - Languages: {language_text}\n")
        return "".join(sections)
    except Exception as exc:
        return f"Error fetching voices: {exc}"
def get_voice_choices():
    """Return dropdown choices for the official Mistral voices.

    Lists the account's voices and keeps only those whose name starts with
    one of the official speaker names (Paul, Oliver, Jane, Marie) and
    contains " - ", so that arbitrary user-cloned voices stay hidden.

    Returns:
        A list of ``(label, voice_id)`` tuples. An empty list is returned
        when the lookup fails, so the UI simply shows no presets.
    """
    official_names = ("Paul", "Oliver", "Jane", "Marie")
    try:
        res = get_client().audio.voices.list(limit=100, offset=0)
        return [
            (f"{v.name}", v.id)
            for v in res.items
            if v.name.startswith(official_names) and " - " in v.name
        ]
    except Exception:
        # Best-effort on purpose, but unlike a bare ``except:`` this no longer
        # swallows KeyboardInterrupt / SystemExit.
        return []
# ─── Client ───────────────────────────────────────────────────────────────────
def get_client():
    """Build a Mistral client from the ``MISTRAL_API_KEY`` environment variable.

    Surrounding whitespace (for example a trailing newline pasted into the
    secret) is stripped before the key is used.

    Returns:
        A ``Mistral`` client configured with the key.

    Raises:
        gr.Error: If the variable is unset, empty, or only whitespace.
    """
    api_key = (os.environ.get("MISTRAL_API_KEY") or "").strip()
    if not api_key:
        raise gr.Error("MISTRAL_API_KEY secret is not set. Add it in Space Settings → Secrets.")
    return Mistral(api_key=api_key)
# ─── STT ──────────────────────────────────────────────────────────────────────
def transcribe_audio(audio_path, language):
    """Convert audio file → text using Voxtral Mini Transcribe.

    Args:
        audio_path: Filesystem path of the recording; None or an empty
            string means nothing was recorded or uploaded.
        language: Language code forwarded to the API, or "Auto-detect" to
            leave the language parameter out of the request.

    Returns:
        The transcription text, or a warning/error message string. Failures
        are reported in the returned string rather than raised.
    """
    if not audio_path:
        return "⚠️ Please record or upload an audio file first."
    try:
        client = get_client()
        kwargs = {"model": "voxtral-mini-latest"}
        if language and language != "Auto-detect":
            kwargs["language"] = language
        # The request must be sent while the file handle is still open.
        with open(audio_path, "rb") as f:
            kwargs["file"] = {"content": f, "file_name": Path(audio_path).name}
            response = client.audio.transcriptions.complete(**kwargs)
        return response.text
    except Exception as e:
        return f"❌ Error: {str(e)}"
# ─── TTS ──────────────────────────────────────────────────────────────────────
# Preset voice labels mapped to voice IDs; the single entry maps to no voice.
BUILTIN_VOICES = {"Default (no voice clone)": None}
def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
    """Convert text → speech using Voxtral Mini TTS.

    Args:
        text: The text to speak. None, empty, or whitespace-only text is
            rejected with a warning.
        voice_id_input: ID of a saved voice; None or blank means "not given".
        ref_audio_path: Optional path of a reference clip, sent base64-encoded
            as ``ref_audio`` for zero-shot voice cloning.
        audio_format: Response format requested from the API; also used as
            the extension of the generated file.

    Returns:
        A ``(audio_file_path, status_message)`` tuple. The path is None when
        no audio was produced; failures are reported in the message rather
        than raised.
    """
    if not text or not text.strip():
        return None, "⚠️ Please enter some text."
    try:
        client = get_client()
        voice_id = (voice_id_input or "").strip() or None
        # The API needs a voice from one of the two sources.
        if not voice_id and not ref_audio_path:
            raise gr.Error("Mistral API requires a voice! Please either upload a short 'Reference Audio' clip to define the voice tone zero-shot, OR paste a valid Voice ID you cloned in the Voice Cloning tab. There are no built-in standard voices.")
        kwargs = dict(
            model="voxtral-mini-tts-2603",
            input=text,
            response_format=audio_format,
        )
        if voice_id:
            kwargs["voice_id"] = voice_id
        # Add Reference Audio for Zero-shot tone/voice cloning
        if ref_audio_path:
            with open(ref_audio_path, "rb") as f:
                kwargs["ref_audio"] = base64.b64encode(f.read()).decode("utf-8")
        response = client.audio.speech.complete(**kwargs)
        audio_bytes = base64.b64decode(response.audio_data)
        # delete=False: the file must outlive this call so Gradio can serve it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{audio_format}") as tmp:
            tmp.write(audio_bytes)
        return tmp.name, f"✅ Generated {len(audio_bytes):,} bytes of {audio_format.upper()} audio."
    except Exception as e:
        return None, f"❌ Error: {str(e)}"
# ─── Voice Cloning ────────────────────────────────────────────────────────────
def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
    """Upload a sample audio or provide a URL to create a reusable cloned voice.

    When a URL is given it takes precedence over the uploaded clip. Direct
    audio links (.mp3/.wav/.flac/.ogg/.m4a) are downloaded with ``requests``;
    any other URL is handed to ``yt-dlp`` and converted to MP3. Downloads go
    into a private temporary directory that is removed on every exit path.

    Args:
        audio_path: Path of an uploaded/recorded sample, or None.
        url_input: Media URL, or blank/None when no URL is used.
        voice_name: Name for the new voice (required).
        gender: Gender label; sent lower-cased to the API.
        languages_str: Comma-separated language codes; defaults to ``["en"]``
            when nothing usable is given.

    Returns:
        A ``(status_markdown, dropdown_update)`` tuple. On success the update
        carries the refreshed voice choices with the new voice selected;
        otherwise it is a no-op ``gr.update()``. Failures are reported in the
        status text rather than raised.
    """
    import shutil  # local import: only needed for download clean-up

    url = (url_input or "").strip()
    name = (voice_name or "").strip()
    if not audio_path and not url:
        return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
    if not name:
        return "⚠️ Please enter a name for the voice.", gr.update()

    final_audio_path = audio_path
    download_dir = None
    try:
        # If URL is provided, handle direct links or yt-dlp
        if url:
            # NOTE(review): the URL is user-supplied and fetched server-side with
            # no host allow-list or download size cap; consider adding both.
            # mkdtemp creates a private directory, avoiding the race inherent
            # in the deprecated tempfile.mktemp().
            download_dir = tempfile.mkdtemp(prefix="voxtral_clone_")
            base_out = os.path.join(download_dir, "sample")
            # If it's a direct audio file link, bypass yt-dlp and download it directly
            if url.lower().endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a')):
                try:
                    ext = url.rsplit('.', 1)[-1].lower()
                    final_audio_path = f"{base_out}.{ext}"
                    with requests.get(url, stream=True, timeout=15) as r:
                        r.raise_for_status()
                        with open(final_audio_path, 'wb') as f:
                            for chunk in r.iter_content(chunk_size=8192):
                                f.write(chunk)
                except Exception as e:
                    return f"❌ Error downloading direct audio link: {str(e)}", gr.update()
            # Otherwise use yt-dlp for TikTok, Twitter, YouTube (if not blocked), etc.
            else:
                import yt_dlp
                ydl_opts = {
                    'format': 'bestaudio/best',
                    'outtmpl': base_out + '.%(ext)s',
                    'quiet': True,
                    'postprocessors': [{
                        'key': 'FFmpegExtractAudio',
                        'preferredcodec': 'mp3',
                        'preferredquality': '128',
                    }],
                    'postprocessor_args': [
                        '-t', '60'  # Limit to first 60 seconds
                    ],
                }
                try:
                    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                        ydl.extract_info(url, download=True)
                    final_audio_path = base_out + '.mp3'
                except Exception as e:
                    err_msg = str(e)
                    # These errors are caught by the outer handler below and
                    # surface to the user as a status message.
                    if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
                        raise gr.Error("YouTube blocked the Hugging Face Server. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.")
                    else:
                        raise gr.Error(f"Video download failed: {err_msg}")

        client = get_client()
        sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
        langs = [l.strip() for l in languages_str.split(",") if l.strip()] or ["en"]
        voice = client.audio.voices.create(
            name=name,
            sample_audio=sample_b64,
            sample_filename=Path(final_audio_path).name,
            languages=langs,
            gender=gender.lower(),
        )
        # Build new choices specifically for this user session: Official Voices + Their new clone
        new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
        return (
            f"✅ Voice created!\n\n**Voice ID:** `{voice.id}`\n**Name:** {voice.name}\n**Languages:** {', '.join(voice.languages)}\n\nThis voice has been automatically selected in the Text-to-Speech tab!",
            gr.update(choices=new_session_choices, value=voice.id)
        )
    except Exception as e:
        return f"❌ Error: {str(e)}", gr.update()
    finally:
        # Clean up downloaded files on success and on every failure path.
        if download_dir is not None:
            shutil.rmtree(download_dir, ignore_errors=True)
# ─── UI ───────────────────────────────────────────────────────────────────────
# Choices for the transcription language dropdown; the first entry is the
# sentinel meaning "do not send a language to the API".
LANGUAGES = [
    "Auto-detect",
    "en",
    "fr",
    "es",
    "de",
    "it",
    "pt",
    "zh",
    "ja",
    "ko",
    "ar",
    "ru",
    "hi",
    "nl",
]
# Custom stylesheet handed to gr.Blocks(css=...): dark gradient background,
# glass-style panels, restyled tabs/inputs/buttons, and a hidden Gradio footer.
# Class names such as app-header, highlight-badge, status-text, audio-component
# and tabs-container are the ones the UI code attaches via elem_classes or
# inline HTML.
css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&display=swap');
* { font-family: 'Outfit', sans-serif; }
body, .gradio-container {
background: radial-gradient(circle at 10% 20%, #150f24 0%, #07040d 100%) !important;
min-height: 100vh;
}
.gradio-container {
max-width: 1050px !important;
margin: 0 auto !important;
}
/* App Header */
.app-header {
text-align: center;
padding: 3.5rem 1rem 1.5rem;
position: relative;
z-index: 10;
}
.app-header h1 {
font-size: 3.2rem;
font-weight: 800;
letter-spacing: -1.5px;
background: linear-gradient(135deg, #c084fc 0%, #ec4899 50%, #facc15 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
margin-bottom: 0.5rem;
animation: glow-pulse 3s infinite alternate;
}
.app-header p {
color: #94a3b8;
font-size: 1.25rem;
font-weight: 500;
margin-top: 0;
}
.highlight-badge {
background: linear-gradient(135deg, #f59e0b, #ef4444);
color: white;
padding: 2px 8px;
border-radius: 8px;
font-size: 0.8rem;
font-weight: 800;
vertical-align: top;
margin-left: 10px;
box-shadow: 0 0 10px rgba(239, 68, 68, 0.6);
}
/* Glass panel wrapper */
div.tabs-container, .panel-box {
background: rgba(255, 255, 255, 0.02) !important;
border: 1px solid rgba(255, 255, 255, 0.05) !important;
border-radius: 20px !important;
box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.4) !important;
backdrop-filter: blur(15px) !important;
-webkit-backdrop-filter: blur(15px) !important;
overflow: hidden;
}
/* Tabs */
.tab-nav {
border-bottom: 1px solid rgba(255,255,255,0.05) !important;
padding: 10px 10px 0 10px !important;
}
.tab-nav button {
background: transparent !important;
border: none !important;
border-bottom: 3px solid transparent !important;
color: #64748b !important;
border-radius: 0 !important;
margin: 0 !important;
padding: 1rem 2rem !important;
font-weight: 600 !important;
font-size: 1.05rem !important;
transition: all 0.3s ease !important;
box-shadow: none !important;
}
.tab-nav button.selected, .tab-nav button:hover {
color: #f8fafc !important;
border-bottom: 3px solid #ec4899 !important;
box-shadow: 0 20px 20px -20px rgba(236,72,153,0.3) !important;
background: linear-gradient(0deg, rgba(236,72,153,0.1) 0%, transparent 100%) !important;
}
/* Inputs & Textareas */
textarea, input[type="text"], .dropdown-menu {
background: rgba(0,0,0,0.25) !important;
border: 1px solid rgba(255,255,255,0.08) !important;
border-radius: 14px !important;
color: #f8fafc !important;
font-size: 1.05rem !important;
transition: all 0.2s ease !important;
padding: 0.75rem !important;
}
textarea:focus, input[type="text"]:focus {
border-color: #ec4899 !important;
box-shadow: 0 0 0 3px rgba(236,72,153,0.2) !important;
background: rgba(0,0,0,0.4) !important;
}
/* Override Gradio layout borders */
div.form {
border: none !important;
box-shadow: none !important;
background: transparent !important;
}
/* Cool gradient buttons */
button.primary {
background: linear-gradient(135deg, #a78bfa 0%, #ec4899 100%) !important;
border: none !important;
color: white !important;
border-radius: 14px !important;
font-weight: 700 !important;
font-size: 1.15rem !important;
padding: 0.9rem !important;
letter-spacing: 0.5px !important;
box-shadow: 0 4px 15px rgba(236,72,153,0.3) !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button.primary:hover {
transform: translateY(-3px) !important;
box-shadow: 0 8px 25px rgba(236,72,153,0.5) !important;
}
/* Secondary Button */
button.secondary {
background: rgba(255,255,255,0.05) !important;
border: 1px solid rgba(255,255,255,0.1) !important;
border-radius: 14px !important;
color: #e2e8f0 !important;
transition: all 0.2s ease !important;
font-weight: 600 !important;
}
button.secondary:hover {
background: rgba(255,255,255,0.15) !important;
border-color: rgba(255,255,255,0.3) !important;
}
/* Status text box */
.status-text {
background: rgba(0,0,0,0.4);
padding: 1.5rem;
border-radius: 16px;
border-left: 5px solid #a78bfa;
color: #e2e8f0;
font-size: 1rem;
line-height: 1.6;
}
/* Highlight labels */
label span {
color: #cbd5e1 !important;
font-weight: 500 !important;
letter-spacing: 0.2px !important;
}
/* Clean audio components */
.audio-component {
border-radius: 16px !important;
overflow: hidden !important;
border: 1px solid rgba(255,255,255,0.05) !important;
}
/* Global Animations */
@keyframes glow-pulse {
0% { filter: drop-shadow(0 0 15px rgba(167,139,250,0.3)); }
100% { filter: drop-shadow(0 0 30px rgba(236,72,153,0.6)); }
}
/* Footer Hide */
footer { display: none !important; }
"""
# Fetch the preset voice list once at import time so the TTS dropdown starts
# populated. get_voice_choices() returns [] on failure, in which case the
# dropdown starts empty and can be refilled with the refresh button.
INITIAL_VOICES = get_voice_choices()

# NOTE(review): the commit message mentions Gradio 6; confirm that gr.Blocks()
# still accepts `css=` in the Gradio version this Space pins.
with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:
    gr.HTML("""
<div class="app-header">
<h1>🎙️ Voxtral Studio <span class="highlight-badge">VOICE CLONING</span></h1>
<p>Powered by Mistral AI · STT & Elite Text-to-Speech + Instant Zero-Shot Cloning</p>
</div>
""")
    with gr.Tabs():
        # ── TAB 1: Speech to Text ──────────────────────────────────────────
        with gr.TabItem("🎤 Speech → Text"):
            gr.Markdown("""
**Upload or record audio** and Voxtral Mini will transcribe it with high accuracy.
Supports 13 languages, handles noise, and can detect the language automatically.
""")
            with gr.Row():
                with gr.Column(scale=1):
                    stt_audio = gr.Audio(
                        label="Audio Input",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    stt_language = gr.Dropdown(
                        choices=LANGUAGES,
                        value="Auto-detect",
                        label="Language",
                    )
                    stt_btn = gr.Button("✨ Transcribe", variant="primary")
                with gr.Column(scale=1):
                    stt_output = gr.Textbox(
                        label="Transcription",
                        lines=12,
                        placeholder="Your transcribed text will appear here...",
                    )
            stt_btn.click(
                fn=transcribe_audio,
                inputs=[stt_audio, stt_language],
                outputs=stt_output,
            )
        # ── TAB 2: Text to Speech ──────────────────────────────────────────
        with gr.TabItem("🔊 Text → Speech", elem_classes=["tabs-container"]):
            gr.Markdown("""
**Type text** and Voxtral Mini TTS converts it to natural speech.
Optionally paste a **Voice ID** from the Voice Cloning tab to use your own cloned voice.
""")
            with gr.Row():
                with gr.Column(scale=1):
                    tts_text = gr.Textbox(
                        label="Text to speak",
                        lines=8,
                        placeholder="Enter text here (max ~300 words for best results). Avoid markdown or special characters.",
                        value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
                    )
                    with gr.Row():
                        # allow_custom_value lets users paste a voice ID that
                        # is not among the listed choices.
                        tts_voice_id = gr.Dropdown(
                            label="Select a Mistral Voice or Your Clones",
                            choices=INITIAL_VOICES,
                            value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
                            allow_custom_value=True,
                            scale=3,
                        )
                        voices_btn = gr.Button("🔄 Refresh List", size="sm", scale=1)
                    voices_list_out = gr.Markdown(visible=False)  # Hide text list since we use dropdown now
                    tts_ref_audio = gr.Audio(
                        label="OR: Reference Audio (Set voice tone instantly)",
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                    tts_format = gr.Dropdown(
                        choices=["mp3", "wav", "flac", "opus"],
                        value="mp3",
                        label="Audio Format",
                    )
                    tts_btn = gr.Button("🎵 Generate Speech", variant="primary")
                with gr.Column(scale=1):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    tts_status = gr.Markdown(elem_classes=["status-text"])
            tts_btn.click(
                fn=synthesize_speech,
                inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                outputs=[tts_audio_out, tts_status],
            )
            # Re-query the voice list on demand and replace the dropdown choices.
            voices_btn.click(
                fn=lambda: gr.update(choices=get_voice_choices()),
                inputs=[],
                outputs=tts_voice_id,
            )
        # ── TAB 3: Voice Cloning ───────────────────────────────────────────
        with gr.TabItem("🧬 Voice Cloning", elem_classes=["tabs-container"]):
            gr.Markdown("""
**Clone any voice** by uploading a short audio sample (10–60 seconds recommended).
The model will save it as a reusable voice. Copy the Voice ID and paste it in the TTS tab.
> ⚠️ Only clone voices with **explicit consent**. Do not impersonate real people.
""")
            with gr.Row():
                with gr.Column(scale=1):
                    clone_audio = gr.Audio(
                        label="Voice Sample (upload or record)",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    clone_url = gr.Textbox(
                        label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
                        placeholder="https://...link_to_audio_or_video...",
                    )
                    clone_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g. my-assistant-voice",
                    )
                    clone_gender = gr.Dropdown(
                        choices=["Female", "Male"],
                        value="Female",
                        label="Gender",
                    )
                    clone_langs = gr.Textbox(
                        label="Languages (comma-separated)",
                        value="en",
                        placeholder="en, fr, es",
                    )
                    clone_btn = gr.Button("🧬 Clone Voice", variant="primary")
                with gr.Column(scale=1):
                    clone_result = gr.Markdown(
                        value="Your new Voice ID will appear here after cloning.",
                        elem_classes=["status-text"],
                    )
            # The second output updates the TTS dropdown so that the freshly
            # cloned voice is selected there.
            clone_btn.click(
                fn=clone_voice,
                inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                outputs=[clone_result, tts_voice_id],
            )
    gr.HTML("""
<div style="text-align:center; padding: 1.5rem; color: #475569; font-size: 0.85rem;">
Built with <a href="https://docs.mistral.ai/capabilities/audio/" target="_blank" style="color:#a78bfa;">Mistral Voxtral</a>
· <a href="https://huggingface.co/" target="_blank" style="color:#60a5fa;">Hugging Face Spaces</a>
</div>
""")

if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)