Spaces:

Duplicated from mehdilaalali/voxtral-studio

codexxx
/

voxtral-studio

Sleeping

App Files Files Community

voxtral-studio / app.py

mehdilaalali's picture

fix: remove demo.load to prevent Gradio 6 tab component race condition freezing UI

405e984 verified about 2 months ago

21.7 kB

	import os
	import base64
	import tempfile
	import gradio as gr
	from pathlib import Path
	import requests
	from mistralai.client import Mistral

	def list_user_voices():
	    """Return a plain-text listing of the voices available to the account.

	    Kept as a debugging aid. Queries the first 100 voices and renders one
	    entry per voice (name, ID, languages). Any failure is reported as an
	    error string instead of being raised.
	    """
	    try:
	        listing = get_client().audio.voices.list(limit=100, offset=0)
	        if listing.total == 0:
	            return "No voices found in your account."

	        sections = [f"Total Voices: {listing.total}\n\n"]
	        for entry in listing.items:
	            # Some voice objects may not expose a `languages` attribute.
	            if hasattr(entry, 'languages'):
	                langs = ', '.join(entry.languages)
	            else:
	                langs = 'unknown'
	            sections.append(f"- {entry.name}\n - ID: `{entry.id}`\n - Languages: {langs}\n")
	        return "".join(sections)
	    except Exception as e:
	        return f"Error fetching voices: {str(e)}"

	def get_voice_choices():
	    """Return dropdown choices for the official Mistral voices.

	    Queries the first 100 voices and keeps only those whose name starts with
	    one of the official speaker names and contains " - ", so that voices
	    cloned by users are hidden.

	    Returns:
	        A list of ``(label, voice_id)`` tuples, or ``[]`` if the lookup fails
	        for any reason (missing API key, network error, unexpected payload).
	    """
	    # Official Mistral Voices (Paul, Oliver, Jane, Marie).
	    official_names = ("Paul", "Oliver", "Jane", "Marie")
	    try:
	        res = get_client().audio.voices.list(limit=100, offset=0)
	        return [
	            (f"{v.name}", v.id)
	            for v in res.items
	            if v.name.startswith(official_names) and " - " in v.name
	        ]
	    except Exception:
	        # Deliberately best-effort: this runs while the UI is being built, so
	        # a failed lookup must yield an empty dropdown rather than crash.
	        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
	        # propagate.
	        return []


	# ─── Client ───────────────────────────────────────────────────────────────────
	def get_client():
	    """Create a Mistral client from the ``MISTRAL_API_KEY`` environment variable.

	    Raises:
	        gr.Error: if the variable is unset or empty.
	    """
	    key = os.getenv("MISTRAL_API_KEY")
	    if key:
	        return Mistral(api_key=key)
	    raise gr.Error("MISTRAL_API_KEY secret is not set. Add it in Space Settings → Secrets.")


	# ─── STT ──────────────────────────────────────────────────────────────────────
	def transcribe_audio(audio_path, language):
	    """Transcribe an audio file to text using Voxtral Mini Transcribe.

	    Args:
	        audio_path: Path of the recorded/uploaded audio file, or None.
	        language: Language code, or "Auto-detect" to send no language hint.

	    Returns:
	        The transcription text, a warning string when no audio was given,
	        or an error string if the request fails.
	    """
	    if audio_path is None:
	        return "⚠️ Please record or upload an audio file first."
	    try:
	        client = get_client()
	        with open(audio_path, "rb") as audio_file:
	            request = {
	                "model": "voxtral-mini-latest",
	                "file": {"content": audio_file, "file_name": Path(audio_path).name},
	            }
	            # "Auto-detect" (or an empty value) means: omit the language hint.
	            if language != "Auto-detect" and language:
	                request["language"] = language
	            # The call stays inside the `with` so the file is still open
	            # while it is being uploaded.
	            return client.audio.transcriptions.complete(**request).text
	    except Exception as e:
	        return f"❌ Error: {str(e)}"


	# ─── TTS ──────────────────────────────────────────────────────────────────────
	# Display label -> voice ID for built-in voices. The single entry maps the
	# "no clone" label to None (i.e. no voice ID).
	BUILTIN_VOICES = {"Default (no voice clone)": None}

	def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
	    """Convert text to speech using Voxtral Mini TTS.

	    Args:
	        text: Text to speak. None, empty or whitespace-only text is rejected.
	        voice_id_input: Voice ID to use, or None/blank for none.
	        ref_audio_path: Optional path to a reference clip for zero-shot
	            voice/tone cloning.
	        audio_format: Output format; used both as the API `response_format`
	            and as the temp-file suffix.

	    Returns:
	        ``(audio_file_path, status_message)`` on success, or
	        ``(None, message)`` when input is missing or the request fails.
	        The generated file is not deleted here (it is handed to the UI).
	    """
	    # `text` may be None as well as blank; guard before calling .strip().
	    if not text or not text.strip():
	        return None, "⚠️ Please enter some text."
	    try:
	        client = get_client()
	        voice_id = (voice_id_input or "").strip() or None

	        kwargs = dict(
	            model="voxtral-mini-tts-2603",
	            input=text,
	            response_format=audio_format,
	        )
	        if voice_id:
	            kwargs["voice_id"] = voice_id

	        # Add Reference Audio for Zero-shot tone/voice cloning
	        if ref_audio_path:
	            ref_bytes = Path(ref_audio_path).read_bytes()
	            kwargs["ref_audio"] = base64.b64encode(ref_bytes).decode("utf-8")
	        if not voice_id and not ref_audio_path:
	            # Raised inside the try on purpose: the handler below turns it
	            # into the status message shown to the user.
	            raise gr.Error("Mistral API requires a voice! Please either upload a short 'Reference Audio' clip to define the voice tone zero-shot, OR paste a valid Voice ID you cloned in the Voice Cloning tab. There are no built-in standard voices.")

	        response = client.audio.speech.complete(**kwargs)
	        audio_bytes = base64.b64decode(response.audio_data)

	        # Write to temp file; the context manager closes the handle even if
	        # the write fails. delete=False keeps the file for the caller.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{audio_format}") as tmp:
	            tmp.write(audio_bytes)
	        return tmp.name, f"✅ Generated {len(audio_bytes):,} bytes of {audio_format.upper()} audio."
	    except Exception as e:
	        return None, f"❌ Error: {str(e)}"


	# ─── Voice Cloning ────────────────────────────────────────────────────────────
	def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
	    """Create a reusable cloned voice from an uploaded sample or a media URL.

	    If a URL is given it takes precedence over `audio_path`: direct audio
	    links are downloaded with `requests`, anything else goes through yt-dlp
	    (audio extracted to MP3). Downloads are placed in a private temporary
	    directory that is always removed before returning.

	    Args:
	        audio_path: Path of an uploaded/recorded sample, or None.
	        url_input: Media URL, or None/blank.
	        voice_name: Name for the new voice (required).
	        gender: Gender label; sent lower-cased to the API.
	        languages_str: Comma-separated language codes; defaults to ["en"].

	    Returns:
	        ``(status_markdown, dropdown_update)``. On success the update carries
	        the official voices plus the new clone, with the clone selected; on
	        any failure it is an empty ``gr.update()``.
	    """
	    import shutil  # only needed here, for temp-directory cleanup

	    # Normalise optional inputs up front so None never reaches .strip().
	    url = (url_input or "").strip()
	    name = (voice_name or "").strip()

	    if not audio_path and not url:
	        return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
	    if not name:
	        return "⚠️ Please enter a name for the voice.", gr.update()

	    final_audio_path = audio_path
	    download_dir = None

	    try:
	        # If URL is provided, handle direct links or yt-dlp
	        if url:
	            # NOTE(review): `url` is user-supplied and fetched server-side
	            # with no host allow-list or size cap — consider restricting.
	            # mkdtemp (instead of the race-prone tempfile.mktemp) gives a
	            # directory only this process can write to.
	            download_dir = tempfile.mkdtemp(prefix="voxtral-clone-")
	            base_out = os.path.join(download_dir, "sample")

	            # If it's a direct audio file link, bypass yt-dlp and download it directly
	            if url.lower().endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a')):
	                try:
	                    ext = url.rsplit('.', 1)[-1]
	                    final_audio_path = f"{base_out}.{ext}"
	                    with requests.get(url, stream=True, timeout=15) as r:
	                        r.raise_for_status()
	                        with open(final_audio_path, 'wb') as f:
	                            for chunk in r.iter_content(chunk_size=8192):
	                                f.write(chunk)
	                except Exception as e:
	                    return f"❌ Error downloading direct audio link: {str(e)}", gr.update()
	            # Otherwise use yt-dlp for TikTok, Twitter, YouTube (if not blocked), etc.
	            else:
	                import yt_dlp
	                ydl_opts = {
	                    'format': 'bestaudio/best',
	                    'outtmpl': base_out + '.%(ext)s',
	                    'quiet': True,
	                    'postprocessors': [{
	                        'key': 'FFmpegExtractAudio',
	                        'preferredcodec': 'mp3',
	                        'preferredquality': '128',
	                    }],
	                    'postprocessor_args': [
	                        '-t', '60'  # Limit to first 60 seconds
	                    ],
	                }
	                try:
	                    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	                        ydl.extract_info(url, download=True)
	                    final_audio_path = base_out + '.mp3'
	                except Exception as e:
	                    err_msg = str(e)
	                    # These gr.Errors are caught by the outer handler and
	                    # surfaced as the status message.
	                    if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
	                        raise gr.Error("YouTube blocked the Hugging Face Server. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.")
	                    else:
	                        raise gr.Error(f"Video download failed: {err_msg}")

	        client = get_client()
	        sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
	        langs = [l.strip() for l in (languages_str or "").split(",") if l.strip()] or ["en"]
	        voice = client.audio.voices.create(
	            name=name,
	            sample_audio=sample_b64,
	            sample_filename=Path(final_audio_path).name,
	            languages=langs,
	            gender=gender.lower(),
	        )

	        # Build new choices specifically for this user session: Official Voices + Their new clone
	        new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
	        return (
	            f"✅ Voice created!\n\nVoice ID: `{voice.id}`\nName: {voice.name}\nLanguages: {', '.join(voice.languages)}\n\nThis voice has been automatically selected in the Text-to-Speech tab!",
	            gr.update(choices=new_session_choices, value=voice.id)
	        )
	    except Exception as e:
	        return f"❌ Error: {str(e)}", gr.update()
	    finally:
	        # Clean up downloaded media on every exit path (success or failure).
	        # User-uploaded files (audio_path) are never touched.
	        if download_dir is not None:
	            shutil.rmtree(download_dir, ignore_errors=True)


	# ─── UI ───────────────────────────────────────────────────────────────────────
	# Choices for the transcription language dropdown. "Auto-detect" is the
	# sentinel that transcribe_audio treats as "send no language hint".
	LANGUAGES = [
	    "Auto-detect",
	    "en", "fr", "es", "de", "it", "pt", "zh",
	    "ja", "ko", "ar", "ru", "hi", "nl",
	]

	# Custom stylesheet handed to gr.Blocks(css=...). It loads the "Outfit" font
	# from Google Fonts, applies a dark gradient background, styles the header,
	# tabs, inputs, buttons, status boxes and audio components, and hides the
	# page footer.
	css = """
	@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&display=swap');

	* { font-family: 'Outfit', sans-serif; }

	body, .gradio-container {
	background: radial-gradient(circle at 10% 20%, #150f24 0%, #07040d 100%) !important;
	min-height: 100vh;
	}

	.gradio-container {
	max-width: 1050px !important;
	margin: 0 auto !important;
	}

	/* App Header */
	.app-header {
	text-align: center;
	padding: 3.5rem 1rem 1.5rem;
	position: relative;
	z-index: 10;
	}
	.app-header h1 {
	font-size: 3.2rem;
	font-weight: 800;
	letter-spacing: -1.5px;
	background: linear-gradient(135deg, #c084fc 0%, #ec4899 50%, #facc15 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	background-clip: text;
	margin-bottom: 0.5rem;
	animation: glow-pulse 3s infinite alternate;
	}
	.app-header p {
	color: #94a3b8;
	font-size: 1.25rem;
	font-weight: 500;
	margin-top: 0;
	}
	.highlight-badge {
	background: linear-gradient(135deg, #f59e0b, #ef4444);
	color: white;
	padding: 2px 8px;
	border-radius: 8px;
	font-size: 0.8rem;
	font-weight: 800;
	vertical-align: top;
	margin-left: 10px;
	box-shadow: 0 0 10px rgba(239, 68, 68, 0.6);
	}

	/* Glass panel wrapper */
	div.tabs-container, .panel-box {
	background: rgba(255, 255, 255, 0.02) !important;
	border: 1px solid rgba(255, 255, 255, 0.05) !important;
	border-radius: 20px !important;
	box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.4) !important;
	backdrop-filter: blur(15px) !important;
	-webkit-backdrop-filter: blur(15px) !important;
	overflow: hidden;
	}

	/* Tabs */
	.tab-nav {
	border-bottom: 1px solid rgba(255,255,255,0.05) !important;
	padding: 10px 10px 0 10px !important;
	}
	.tab-nav button {
	background: transparent !important;
	border: none !important;
	border-bottom: 3px solid transparent !important;
	color: #64748b !important;
	border-radius: 0 !important;
	margin: 0 !important;
	padding: 1rem 2rem !important;
	font-weight: 600 !important;
	font-size: 1.05rem !important;
	transition: all 0.3s ease !important;
	box-shadow: none !important;
	}
	.tab-nav button.selected, .tab-nav button:hover {
	color: #f8fafc !important;
	border-bottom: 3px solid #ec4899 !important;
	box-shadow: 0 20px 20px -20px rgba(236,72,153,0.3) !important;
	background: linear-gradient(0deg, rgba(236,72,153,0.1) 0%, transparent 100%) !important;
	}

	/* Inputs & Textareas */
	textarea, input[type="text"], .dropdown-menu {
	background: rgba(0,0,0,0.25) !important;
	border: 1px solid rgba(255,255,255,0.08) !important;
	border-radius: 14px !important;
	color: #f8fafc !important;
	font-size: 1.05rem !important;
	transition: all 0.2s ease !important;
	padding: 0.75rem !important;
	}
	textarea:focus, input[type="text"]:focus {
	border-color: #ec4899 !important;
	box-shadow: 0 0 0 3px rgba(236,72,153,0.2) !important;
	background: rgba(0,0,0,0.4) !important;
	}

	/* Override Gradio layout borders */
	div.form {
	border: none !important;
	box-shadow: none !important;
	background: transparent !important;
	}

	/* Cool gradient buttons */
	button.primary {
	background: linear-gradient(135deg, #a78bfa 0%, #ec4899 100%) !important;
	border: none !important;
	color: white !important;
	border-radius: 14px !important;
	font-weight: 700 !important;
	font-size: 1.15rem !important;
	padding: 0.9rem !important;
	letter-spacing: 0.5px !important;
	box-shadow: 0 4px 15px rgba(236,72,153,0.3) !important;
	transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
	}
	button.primary:hover {
	transform: translateY(-3px) !important;
	box-shadow: 0 8px 25px rgba(236,72,153,0.5) !important;
	}

	/* Secondary Button */
	button.secondary {
	background: rgba(255,255,255,0.05) !important;
	border: 1px solid rgba(255,255,255,0.1) !important;
	border-radius: 14px !important;
	color: #e2e8f0 !important;
	transition: all 0.2s ease !important;
	font-weight: 600 !important;
	}
	button.secondary:hover {
	background: rgba(255,255,255,0.15) !important;
	border-color: rgba(255,255,255,0.3) !important;
	}

	/* Status text box */
	.status-text {
	background: rgba(0,0,0,0.4);
	padding: 1.5rem;
	border-radius: 16px;
	border-left: 5px solid #a78bfa;
	color: #e2e8f0;
	font-size: 1rem;
	line-height: 1.6;
	}

	/* Highlight labels */
	label span {
	color: #cbd5e1 !important;
	font-weight: 500 !important;
	letter-spacing: 0.2px !important;
	}

	/* Clean audio components */
	.audio-component {
	border-radius: 16px !important;
	overflow: hidden !important;
	border: 1px solid rgba(255,255,255,0.05) !important;
	}

	/* Global Animations */
	@keyframes glow-pulse {
	0% { filter: drop-shadow(0 0 15px rgba(167,139,250,0.3)); }
	100% { filter: drop-shadow(0 0 30px rgba(236,72,153,0.6)); }
	}

	/* Footer Hide */
	footer { display: none !important; }
	"""

	# Voice choices fetched once, at import time, to seed the TTS voice dropdown.
	# get_voice_choices() returns [] on failure, so the UI can still be built.
	INITIAL_VOICES = get_voice_choices()

	# Build the three-tab UI (Speech → Text, Text → Speech, Voice Cloning) and
	# wire each action button to its handler function defined above.
	with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:

	gr.HTML("""
	<div class="app-header">
	<h1>🎙️ Voxtral Studio <span class="highlight-badge">VOICE CLONING</span></h1>
	<p>Powered by Mistral AI · STT & Elite Text-to-Speech + Instant Zero-Shot Cloning</p>

	</div>
	""")

	with gr.Tabs():

	# ── TAB 1: Speech to Text ──────────────────────────────────────────
	with gr.TabItem("🎤 Speech → Text"):
	gr.Markdown("""
	Upload or record audio and Voxtral Mini will transcribe it with high accuracy.
	Supports 13 languages, handles noise, and can detect the language automatically.
	""")
	with gr.Row():
	with gr.Column(scale=1):
	# type="filepath" makes the handler receive a path string (or None).
	stt_audio = gr.Audio(
	label="Audio Input",
	sources=["microphone", "upload"],
	type="filepath",
	elem_classes=["audio-component"],
	)
	stt_language = gr.Dropdown(
	choices=LANGUAGES,
	value="Auto-detect",
	label="Language",
	)
	stt_btn = gr.Button("✨ Transcribe", variant="primary")

	with gr.Column(scale=1):
	stt_output = gr.Textbox(
	label="Transcription",
	lines=12,
	placeholder="Your transcribed text will appear here...",
	)

	# transcribe_audio returns a single string (text, warning or error).
	stt_btn.click(
	fn=transcribe_audio,
	inputs=[stt_audio, stt_language],
	outputs=stt_output,
	)

	# ── TAB 2: Text to Speech ──────────────────────────────────────────
	with gr.TabItem("🔊 Text → Speech", elem_classes=["tabs-container"]):
	gr.Markdown("""
	Type text and Voxtral Mini TTS converts it to natural speech.
	Optionally paste a Voice ID from the Voice Cloning tab to use your own cloned voice.
	""")
	with gr.Row():
	with gr.Column(scale=1):
	tts_text = gr.Textbox(
	label="Text to speak",
	lines=8,
	placeholder="Enter text here (max ~300 words for best results). Avoid markdown or special characters.",
	value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
	)
	with gr.Row():
	# Defaults to the first fetched voice's ID, or no selection when
	# INITIAL_VOICES is empty. allow_custom_value lets a raw Voice ID
	# be typed/pasted in.
	tts_voice_id = gr.Dropdown(
	label="Select a Mistral Voice or Your Clones",
	choices=INITIAL_VOICES,
	value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
	allow_custom_value=True,
	scale=3,
	)
	voices_btn = gr.Button("🔄 Refresh List", size="sm", scale=1)

	voices_list_out = gr.Markdown(visible=False) # Hide text list since we use dropdown now

	tts_ref_audio = gr.Audio(
	label="OR: Reference Audio (Set voice tone instantly)",
	sources=["upload", "microphone"],
	type="filepath",
	)
	tts_format = gr.Dropdown(
	choices=["mp3", "wav", "flac", "opus"],
	value="mp3",
	label="Audio Format",
	)
	tts_btn = gr.Button("🎵 Generate Speech", variant="primary")

	with gr.Column(scale=1):
	tts_audio_out = gr.Audio(
	label="Generated Audio",
	type="filepath",
	elem_classes=["audio-component"],
	)
	tts_status = gr.Markdown(elem_classes=["status-text"])

	# synthesize_speech returns (audio file path or None, status message).
	tts_btn.click(
	fn=synthesize_speech,
	inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
	outputs=[tts_audio_out, tts_status],
	)
	# Refresh re-queries the API and updates only the dropdown's `choices`.
	voices_btn.click(
	fn=lambda: gr.update(choices=get_voice_choices()),
	inputs=[],
	outputs=tts_voice_id,
	)

	# ── TAB 3: Voice Cloning ───────────────────────────────────────────
	with gr.TabItem("🧬 Voice Cloning", elem_classes=["tabs-container"]):
	gr.Markdown("""
	Clone any voice by uploading a short audio sample (10–60 seconds recommended).
	The model will save it as a reusable voice. Copy the Voice ID and paste it in the TTS tab.

	> ⚠️ Only clone voices with explicit consent. Do not impersonate real people.
	""")
	with gr.Row():
	with gr.Column(scale=1):
	clone_audio = gr.Audio(
	label="Voice Sample (upload or record)",
	sources=["microphone", "upload"],
	type="filepath",
	elem_classes=["audio-component"],
	)
	clone_url = gr.Textbox(
	label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
	placeholder="https://...link_to_audio_or_video...",
	)
	clone_name = gr.Textbox(
	label="Voice Name",
	placeholder="e.g. my-assistant-voice",
	)
	clone_gender = gr.Dropdown(
	choices=["Female", "Male"],
	value="Female",
	label="Gender",
	)
	clone_langs = gr.Textbox(
	label="Languages (comma-separated)",
	value="en",
	placeholder="en, fr, es",
	)
	clone_btn = gr.Button("🧬 Clone Voice", variant="primary")

	with gr.Column(scale=1):
	clone_result = gr.Markdown(
	value="Your new Voice ID will appear here after cloning.",
	elem_classes=["status-text"],
	)

	# clone_voice returns (status markdown, dropdown update); the second output
	# targets the TTS tab's voice dropdown so the new clone is offered there.
	clone_btn.click(
	fn=clone_voice,
	inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
	outputs=[clone_result, tts_voice_id],
	)

	gr.HTML("""
	<div style="text-align:center; padding: 1.5rem; color: #475569; font-size: 0.85rem;">
	Built with <a href="https://docs.mistral.ai/capabilities/audio/" target="_blank" style="color:#a78bfa;">Mistral Voxtral</a>
	· <a href="https://huggingface.co/" target="_blank" style="color:#60a5fa;">Hugging Face Spaces</a>
	</div>
	""")


	# When run as a script, serve the app on all network interfaces, port 7860.
	if __name__ == "__main__":
	demo.launch(server_name="0.0.0.0", server_port=7860)