# NOTE(review): the three lines that stood here ("Spaces:" / "Sleeping" /
# "Sleeping") were Hugging Face Spaces page-header residue from scraping,
# not application source. Converted to a comment so the module parses.
import base64
import os
import tempfile
from pathlib import Path

import gradio as gr

from core import (
    get_voice_choices,
    transcribe_audio as core_transcribe,
    synthesize_speech as core_synthesize,
    clone_voice as core_clone,
)
# --- Constants ----------------------------------------------------------------
# Externally hosted clip rendered in the "Sample" tab's <audio> element.
SAMPLE_AUDIO_URL = "https://eburon.ai/sample/sample1.mp3"
# --- Gradio App Wrappers ------------------------------------------------------
def transcribe_handler(audio_path, language):
    """Gradio wrapper around core_transcribe.

    Args:
        audio_path: Filepath of the recorded/uploaded audio; None or "" when
            the user has not supplied any audio yet.
        language: Dropdown selection ("Auto-detect" or an ISO code like "en").

    Returns:
        The transcription text, or a user-facing warning/error string.
        (NOTE(review): the leading glyphs in the messages look mojibake-mangled
        from scraping; preserved verbatim — confirm against the live app.)
    """
    if not audio_path:
        return "β οΈ Please record or upload an audio file first."
    try:
        return core_transcribe(audio_path, language)
    except Exception as e:
        # Surface backend failures inline in the UI instead of crashing.
        return f"β Error: {e}"
def synthesize_handler(text, voice_id_input, ref_audio_path, audio_format):
    """Gradio wrapper around core_synthesize.

    Args:
        text: The text to speak.
        voice_id_input: Voice id from the dropdown; blank/whitespace means
            "no explicit voice selected".
        ref_audio_path: Optional reference-audio filepath for instant cloning.
        audio_format: One of "mp3", "wav", "flac", "opus".

    Returns:
        (output_path, status_markdown) on success, or (None, error_message)
        on failure so the UI shows the problem inline.
    """
    try:
        # Normalize empty / whitespace-only input to None ("no voice chosen").
        voice_id = (voice_id_input or "").strip() or None
        output_path, num_bytes = core_synthesize(text, voice_id, ref_audio_path, audio_format)
        return output_path, f"β Generated {num_bytes:,} bytes of {audio_format.upper()} audio."
    except Exception as e:
        return None, f"β Error: {e}"
def clone_handler(audio_path, url_input, voice_name, gender, languages_str):
    """Gradio wrapper around core_clone.

    On success, returns a Markdown status plus a gr.update() that appends the
    new clone to — and selects it in — this session's TTS voice dropdown.
    On failure, returns an error message plus a no-op gr.update().

    Args:
        audio_path: Optional filepath of an uploaded/recorded voice sample.
        url_input: Optional media URL (TikTok/Twitter or direct audio link).
        voice_name: Display name for the new voice.
        gender: "Female" or "Male".
        languages_str: Comma-separated language codes, e.g. "en, fr".
    """
    try:
        voice = core_clone(audio_path, url_input, voice_name, gender, languages_str)
        # Build new choices specifically for this user session:
        # official voices + the user's freshly created clone.
        new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
        return (
            f"β Voice created!\n\n**Voice ID:** `{voice.id}`\n**Name:** {voice.name}\n**Languages:** {', '.join(voice.languages)}\n\nThis voice has been automatically selected in the Text-to-Speech tab!",
            gr.update(choices=new_session_choices, value=voice.id),
        )
    except Exception as e:
        err_msg = str(e)
        # yt-dlp-style bot-detection failures get a friendlier, actionable hint.
        if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
            return "β YouTube blocked the proxy crawler. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.", gr.update()
        return f"β Error: {err_msg}", gr.update()
# --- UI -----------------------------------------------------------------------
# Transcription language choices; "Auto-detect" defers detection to the backend.
LANGUAGES = [
    "Auto-detect", "en", "fr", "es", "de", "it", "pt",
    "zh", "ja", "ko", "ar", "ru", "hi", "nl",
]
# Custom dark "glass" theme injected into gr.Blocks(css=...).
css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&display=swap');
* { font-family: 'Outfit', sans-serif; }
body, .gradio-container {
  background: radial-gradient(circle at 10% 20%, #120d22 0%, #05030a 100%) !important;
  min-height: 100vh;
}
.gradio-container {
  max-width: 1050px !important;
  margin: 0 auto !important;
}
/* App Header */
.app-header {
  text-align: center;
  padding: 3.5rem 1rem 1.5rem;
  position: relative;
  z-index: 10;
}
.app-header h1 {
  font-size: 3.2rem;
  font-weight: 800;
  letter-spacing: -1.5px;
  background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 50%, #f59e0b 100%);
  -webkit-background-clip: text;
  -webkit-text-fill-color: transparent;
  background-clip: text;
  margin-bottom: 0.5rem;
  animation: glow-pulse 3s infinite alternate;
}
.app-header p {
  color: #94a3b8;
  font-size: 1.25rem;
  font-weight: 500;
  margin-top: 0;
}
.highlight-badge {
  background: linear-gradient(135deg, #06b6d4, #8b5cf6) !important;
  -webkit-background-clip: border-box !important;
  background-clip: border-box !important;
  -webkit-text-fill-color: white !important;
  color: white !important;
  padding: 4px 10px;
  border-radius: 8px;
  font-size: 0.9rem;
  font-weight: 800;
  vertical-align: top;
  margin-left: 10px;
  box-shadow: 0 0 15px rgba(139, 92, 246, 0.45);
  display: inline-block;
  letter-spacing: 0.5px;
}
/* Glass panel wrapper */
div.tabs-container, .panel-box {
  background: rgba(255, 255, 255, 0.02) !important;
  border: 1px solid rgba(255, 255, 255, 0.05) !important;
  border-radius: 20px !important;
  box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.4) !important;
  overflow: visible !important;
}
/* Tabs */
.tab-nav {
  border-bottom: 1px solid rgba(255,255,255,0.05) !important;
  padding: 10px 10px 0 10px !important;
}
.tab-nav button {
  background: transparent !important;
  border: none !important;
  border-bottom: 3px solid transparent !important;
  color: #64748b !important;
  border-radius: 0 !important;
  margin: 0 !important;
  padding: 1rem 2rem !important;
  font-weight: 600 !important;
  font-size: 1.05rem !important;
  transition: all 0.3s ease !important;
  box-shadow: none !important;
}
.tab-nav button.selected, .tab-nav button:hover {
  color: #f8fafc !important;
  border-bottom: 3px solid #06b6d4 !important;
  box-shadow: 0 20px 20px -20px rgba(6,182,212,0.30) !important;
  background: linear-gradient(0deg, rgba(6,182,212,0.10) 0%, transparent 100%) !important;
}
/* Override Gradio layout borders */
div.form {
  border: none !important;
  box-shadow: none !important;
  background: transparent !important;
}
/* Primary Buttons */
button.primary {
  background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%) !important;
  border: none !important;
  color: white !important;
  border-radius: 14px !important;
  font-weight: 700 !important;
  font-size: 1.15rem !important;
  padding: 0.9rem !important;
  letter-spacing: 0.5px !important;
  box-shadow: 0 4px 15px rgba(6,182,212,0.25) !important;
  transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button.primary:hover {
  transform: translateY(-3px) !important;
  box-shadow: 0 8px 25px rgba(6,182,212,0.40) !important;
}
/* Secondary Button */
button.secondary {
  background: rgba(255,255,255,0.05) !important;
  border: 1px solid rgba(255,255,255,0.1) !important;
  border-radius: 14px !important;
  color: #e2e8f0 !important;
  transition: all 0.2s ease !important;
  font-weight: 600 !important;
}
button.secondary:hover {
  background: rgba(255,255,255,0.15) !important;
  border-color: rgba(255,255,255,0.3) !important;
}
/* Status text box */
.status-text {
  background: rgba(0,0,0,0.4);
  padding: 1.5rem;
  border-radius: 16px;
  border-left: 5px solid #06b6d4;
  color: #e2e8f0;
  font-size: 1rem;
  line-height: 1.6;
}
/* Highlight labels */
label span {
  color: #cbd5e1 !important;
  font-weight: 500 !important;
  letter-spacing: 0.2px !important;
}
/* Clean audio components */
.audio-component {
  border-radius: 16px !important;
  overflow: hidden !important;
  border: 1px solid rgba(255,255,255,0.05) !important;
}
/* Sample tab */
.sample-card {
  background: rgba(255,255,255,0.03);
  border: 1px solid rgba(255,255,255,0.08);
  border-radius: 18px;
  padding: 1.5rem;
  box-shadow: 0 10px 30px rgba(0,0,0,0.25);
}
.sample-card h3 {
  margin-top: 0;
  color: #f8fafc;
  font-size: 1.2rem;
  font-weight: 700;
}
.sample-card p {
  color: #94a3b8;
  margin-bottom: 1rem;
}
.sample-audio-wrap {
  background: rgba(0,0,0,0.35);
  border: 1px solid rgba(255,255,255,0.06);
  border-radius: 16px;
  padding: 1rem;
}
.sample-audio-wrap audio {
  width: 100%;
  outline: none;
  border-radius: 12px;
}
/* Global Animations */
@keyframes glow-pulse {
  0% { filter: drop-shadow(0 0 15px rgba(139, 92, 246, 0.25)); }
  100% { filter: drop-shadow(0 0 30px rgba(6, 182, 212, 0.45)); }
}
/* Footer Hide */
footer { display: none !important; }
"""
# Voice list fetched once at import time; per-session clones are appended to a
# session-local copy by clone_handler (choices are (label, id) tuples).
INITIAL_VOICES = get_voice_choices()

with gr.Blocks(title="Eburon Voice Studio", css=css) as demo:
    # NOTE(review): several emoji/arrow glyphs in the UI strings below look
    # mojibake-mangled by scraping (e.g. "ποΈ", "β¨", "β", "Β·"). They are
    # preserved byte-for-byte here; restore the intended characters against
    # the original deployment before shipping.
    gr.HTML("""
    <div class="app-header">
        <h1>ποΈ Eburon Voice Studio <span class="highlight-badge">VOICE LAB</span></h1>
        <p>Powered by Eburon Audio Β· Speech-to-Text, Text-to-Speech, and Instant Voice Cloning</p>
        <div style="margin-top: 15px;">
            <a href="https://eburon.ai" target="_blank" style="text-decoration: none;">
                <span style="background: linear-gradient(135deg, #8b5cf6, #06b6d4); color: white; padding: 6px 14px; border-radius: 20px; font-weight: bold; font-size: 0.9rem; box-shadow: 0 4px 15px rgba(6, 182, 212, 0.35); display: inline-block; cursor: pointer; transition: transform 0.2s;">
                    β¨ Visit Eburon
                </span>
            </a>
        </div>
    </div>
    """)

    with gr.Tabs():
        # --- Tab 1: Speech to Text -------------------------------------------
        with gr.TabItem("π€ Speech β Text", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Upload or record audio** and Eburon Audio will transcribe it with high accuracy.
            Supports multiple languages, handles noisy inputs, and can detect the language automatically.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    stt_audio = gr.Audio(
                        label="Audio Input",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    stt_language = gr.Dropdown(
                        choices=LANGUAGES,
                        value="Auto-detect",
                        label="Language",
                    )
                    stt_btn = gr.Button("β¨ Transcribe", variant="primary")
                with gr.Column(scale=1):
                    stt_output = gr.Textbox(
                        label="Transcription",
                        lines=12,
                        placeholder="Your transcribed text will appear here...",
                    )
            stt_btn.click(
                fn=transcribe_handler,
                inputs=[stt_audio, stt_language],
                outputs=stt_output,
            )

        # --- Tab 2: Text to Speech -------------------------------------------
        with gr.TabItem("π Text β Speech", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Type text** and Eburon Audio converts it into natural speech.
            Optionally paste a **Voice ID** from the Voice Cloning tab to use your own cloned voice.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    tts_text = gr.Textbox(
                        label="Text to speak",
                        lines=8,
                        placeholder="Enter text here (max ~300 words for best results). Avoid markdown or special characters.",
                        value="And that is what makes this moment so important.\n\nBecause we are no longer in the phase where AI is only a spectacle.\nWe are entering the phase where it must become dependable.\nIntegrated.\nEfficient.\nAnd truly beneficial.\n\nThe projects that matter now will be the ones that combine vision with grounded execution.\nThe ones that understand cost as well as capability.\nLatency as well as intelligence.\nHuman need as well as model performance.",
                    )
                    with gr.Row():
                        tts_voice_id = gr.Dropdown(
                            label="Select a Voice or Your Session Clones",
                            choices=INITIAL_VOICES,
                            # Default to the first official voice, if any exist.
                            value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
                            allow_custom_value=True,
                            scale=3,
                        )
                        voices_btn = gr.Button("π Refresh List", size="sm", scale=1)
                    # Hidden placeholder kept for layout/back-compat; never shown.
                    voices_list_out = gr.Markdown(visible=False)
                    tts_ref_audio = gr.Audio(
                        label="OR: Reference Audio (Set voice tone instantly)",
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                    tts_format = gr.Dropdown(
                        choices=["mp3", "wav", "flac", "opus"],
                        value="mp3",
                        label="Audio Format",
                    )
                    tts_btn = gr.Button("π΅ Generate Speech", variant="primary")
                with gr.Column(scale=1):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    tts_status = gr.Markdown(elem_classes=["status-text"])
            tts_btn.click(
                fn=synthesize_handler,
                inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                outputs=[tts_audio_out, tts_status],
            )
            # Re-fetch the official voice list on demand.
            voices_btn.click(
                fn=lambda: gr.update(choices=get_voice_choices()),
                inputs=[],
                outputs=tts_voice_id,
            )

        # --- Tab 3: Voice Cloning --------------------------------------------
        with gr.TabItem("𧬠Voice Cloning", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Clone any voice** by uploading a short audio sample (10β60 seconds recommended).
            The system will save it as a reusable voice. Copy the Voice ID and paste it in the Text-to-Speech tab.
            > β οΈ Only clone voices with explicit consent.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    clone_audio = gr.Audio(
                        label="Voice Sample (upload or record)",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    clone_url = gr.Textbox(
                        label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
                        placeholder="https://...link_to_audio_or_video...",
                    )
                    clone_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g. eburon-assistant-voice",
                    )
                    clone_gender = gr.Dropdown(
                        choices=["Female", "Male"],
                        value="Female",
                        label="Gender",
                    )
                    clone_langs = gr.Textbox(
                        label="Languages (comma-separated)",
                        value="en",
                        placeholder="en, fr, es",
                    )
                    clone_btn = gr.Button("𧬠Clone Voice", variant="primary")
                with gr.Column(scale=1):
                    clone_result = gr.Markdown(
                        value="Your new Voice ID will appear here after cloning.",
                        elem_classes=["status-text"],
                    )
            # On success the handler also updates the TTS tab's voice dropdown.
            clone_btn.click(
                fn=clone_handler,
                inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                outputs=[clone_result, tts_voice_id],
            )

        # --- Tab 4: Sample ---------------------------------------------------
        with gr.TabItem("π§ Sample", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Preview the sample audio** below.
            This is rendered as a native playable audio sample without affecting the existing app flow.
            """)
            gr.HTML(f"""
            <div class="sample-card">
                <h3>Playable Audio Sample</h3>
                <p>Loaded from: <a href="{SAMPLE_AUDIO_URL}" target="_blank" style="color:#06b6d4;">{SAMPLE_AUDIO_URL}</a></p>
                <div class="sample-audio-wrap">
                    <audio controls preload="metadata">
                        <source src="{SAMPLE_AUDIO_URL}" type="audio/mpeg">
                        Your browser does not support the audio element.
                    </audio>
                </div>
            </div>
            """)

    gr.HTML("""
    <div style="text-align:center; padding: 1.5rem; color: #475569; font-size: 0.85rem;">
        Built for <a href="https://eburon.ai" target="_blank" style="color:#06b6d4;">Eburon</a>
        Β· Powered by your existing audio backend
        Β· <a href="https://echo.eburon.ai" target="_blank" style="color:#8b5cf6;">Echo Space</a>
    </div>
    """)

if __name__ == "__main__":
    # Bind on all interfaces at the standard HF Spaces port; SSR disabled.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)