voxtral-studio

Sleeping

File size: 17,462 Bytes

5d0a7e7
 
ceefeb6
7a3ed03
242da97
 
 
 
5d0a7e7
 
242da97
 
 
 
 
 
799eb1a
24b186c
 
 
242da97
 
 
5d0a7e7
 
242da97
5d0a7e7
 
 
242da97
5d0a7e7
 
242da97
 
5d0a7e7
 
 
242da97
5d0a7e7
242da97
973f53e
 
a2bc31b
 
973f53e
a2bc31b
5d0a7e7
242da97
 
 
 
5d0a7e7
 
 
 
 
 
 
 
 
ad19b2a
5d0a7e7
ad19b2a
5d0a7e7
 
520603b
5d0a7e7
 
 
 
ad19b2a
5d0a7e7
 
 
ad19b2a
5d0a7e7
 
ad19b2a
 
 
5d0a7e7
 
559f05c
ad19b2a
 
520603b
5d0a7e7
 
 
ad19b2a
 
5d0a7e7
 
 
559f05c
 
5d0a7e7
 
559f05c
520603b
f570a03
 
 
 
 
559f05c
f570a03
559f05c
 
 
520603b
f570a03
 
559f05c
5d0a7e7
ad19b2a
 
 
 
 
 
56515c2
ad19b2a
 
5d0a7e7
ad19b2a
 
 
 
5d0a7e7
ad19b2a
 
 
 
 
 
 
 
 
 
 
5d0a7e7
 
ad19b2a
520603b
 
 
5d0a7e7
 
ad19b2a
 
 
 
 
 
 
520603b
5d0a7e7
520603b
5d0a7e7
 
ad19b2a
 
 
 
 
520603b
ad19b2a
5d0a7e7
 
ad19b2a
520603b
ad19b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
5d0a7e7
 
ad19b2a
5d0a7e7
ad19b2a
 
 
520603b
ad19b2a
 
 
5d0a7e7
 
ad19b2a
5d0a7e7
 
 
ad19b2a
5d0a7e7
7a3ed03
ad19b2a
5d0a7e7
ad19b2a
5d0a7e7
ad19b2a
 
 
24b186c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad19b2a
 
520603b
 
5d0a7e7
ad19b2a
 
 
5d0a7e7
 
405e984
a2bc31b
520603b
5d0a7e7
 
 
520603b
 
9ac4f2a
520603b
 
 
9ac4f2a
 
 
5d0a7e7
 
 
 
 
 
83e26c3
5d0a7e7
520603b
 
5d0a7e7
7a3ed03
5d0a7e7
 
 
 
 
 
7a3ed03
5d0a7e7
 
 
 
 
 
7a3ed03
5d0a7e7
 
 
 
 
7a3ed03
5d0a7e7
 
242da97
5d0a7e7
 
 
 
 
ad19b2a
5d0a7e7
520603b
5d0a7e7
 
 
 
 
 
 
 
023599d
5d0a7e7
799eb1a
a2bc31b
520603b
405e984
 
a2bc31b
799eb1a
 
a2bc31b
520603b
 
 
a478940
 
 
 
5d0a7e7
 
 
 
 
 
 
 
 
 
 
 
 
7a3ed03
5d0a7e7
 
 
242da97
a478940
5d0a7e7
7a3ed03
799eb1a
a2bc31b
799eb1a
a2bc31b
799eb1a
7a3ed03
5d0a7e7
ad19b2a
5d0a7e7
 
520603b
5d0a7e7
520603b
5d0a7e7
 
 
 
 
 
 
 
 
3cda682
559f05c
 
3cda682
5d0a7e7
 
520603b
5d0a7e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242da97
3cda682
a2bc31b
7a3ed03
 
24b186c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d0a7e7
 
520603b
 
24b186c
5d0a7e7
 
 
 
 
520603b

import os
import base64
import tempfile
import gradio as gr
import base64
import os
import tempfile
import gradio as gr
from pathlib import Path

from core import (
    get_voice_choices,
    transcribe_audio as core_transcribe,
    synthesize_speech as core_synthesize,
    clone_voice as core_clone
)

# ─── Constants ────────────────────────────────────────────────────────────────
# Remote MP3 that the "Sample" tab links to and embeds in its HTML <audio> player.
SAMPLE_AUDIO_URL = "https://eburon.ai/sample/sample1.mp3"

# ─── Gradio App Wrappers ──────────────────────────────────────────────────────
def transcribe_handler(audio_path, language):
    """Gradio callback for the Speech → Text tab.

    Args:
        audio_path: Path of the recorded/uploaded audio file; falsy when the
            user has not provided any audio.
        language: Language selection, forwarded unchanged to the backend.

    Returns:
        The backend's transcription result, or a warning/error string meant
        to be shown in the transcription textbox.
    """
    if audio_path:
        try:
            transcript = core_transcribe(audio_path, language)
        except Exception as exc:
            # Report the failure in the output box instead of raising.
            return "❌ Error: " + str(exc)
        return transcript
    return "⚠️ Please record or upload an audio file first."

def synthesize_handler(text, voice_id_input, ref_audio_path, audio_format):
    """Gradio callback for the Text → Speech tab.

    Args:
        text: Text to speak.
        voice_id_input: Voice ID chosen in the dropdown; empty or
            whitespace-only values are passed to the backend as ``None``.
        ref_audio_path: Optional reference audio file path, forwarded as-is.
        audio_format: Output format name (e.g. ``"mp3"``), forwarded as-is
            and upper-cased for the status message.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, message)`` when the text is empty or synthesis raises.
    """
    # Reject blank input up front so the user gets an actionable message
    # instead of whatever error the backend produces for empty text.
    if not text or not text.strip():
        return None, "⚠️ Please enter some text to synthesize first."
    try:
        # Blank dropdown values mean "no explicit voice".
        voice_id = (voice_id_input or "").strip() or None
        output_path, num_bytes = core_synthesize(text, voice_id, ref_audio_path, audio_format)
        return output_path, f"✅ Generated {num_bytes:,} bytes of {audio_format.upper()} audio."
    except Exception as e:
        # Report the failure in the status panel instead of raising.
        return None, f"❌ Error: {str(e)}"

def clone_handler(audio_path, url_input, voice_name, gender, languages_str):
    """Gradio callback for the Voice Cloning tab.

    Forwards all inputs unchanged to the backend clone function and, on
    success, adds the new voice to the Text-to-Speech dropdown and selects it.

    Args:
        audio_path: Path of the uploaded/recorded voice sample, if any.
        url_input: Media URL to pull the sample from, if any.
        voice_name: Name for the new voice.
        gender: Gender selection from the dropdown.
        languages_str: Comma-separated languages as typed by the user.

    Returns:
        A 2-tuple ``(markdown_message, dropdown_update)``. On failure the
        second element is a no-op ``gr.update()`` so the dropdown is untouched.
    """
    import re  # local import: only needed for the error classification below

    try:
        voice = core_clone(audio_path, url_input, voice_name, gender, languages_str)
        # Build new choices specifically for this user session: Official Voices + Their new clone
        new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
        return (
            f"✅ Voice created!\n\n**Voice ID:** `{voice.id}`\n**Name:** {voice.name}\n**Languages:** {', '.join(voice.languages)}\n\nThis voice has been automatically selected in the Text-to-Speech tab!",
            gr.update(choices=new_session_choices, value=voice.id)
        )
    except Exception as e:
        err_msg = str(e)
        lowered = err_msg.lower()
        # Match "bot"/"bots" as a whole word only: a plain substring test also
        # fires on unrelated messages containing e.g. "both", "bottom" or
        # "robot" and would wrongly blame YouTube for them.
        looks_blocked = (
            "Sign in to confirm" in err_msg
            or re.search(r"\bbots?\b", lowered) is not None
            or "youtube" in lowered
        )
        if looks_blocked:
            return "❌ YouTube blocked the proxy crawler. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.", gr.update()
        return f"❌ Error: {err_msg}", gr.update()


# ─── UI ───────────────────────────────────────────────────────────────────────
# Choices offered by the speech-to-text language dropdown: the "Auto-detect"
# entry first, followed by two-letter language codes.
LANGUAGES = [
    "Auto-detect",
    "en",
    "fr",
    "es",
    "de",
    "it",
    "pt",
    "zh",
    "ja",
    "ko",
    "ar",
    "ru",
    "hi",
    "nl",
]

# Custom stylesheet handed to gr.Blocks(css=...). It loads the Outfit web font,
# applies a dark gradient background, styles the header, tabs, buttons, status
# panels and the sample card, and hides Gradio's default footer.
css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&display=swap');

* { font-family: 'Outfit', sans-serif; }

body, .gradio-container {
    background: radial-gradient(circle at 10% 20%, #120d22 0%, #05030a 100%) !important;
    min-height: 100vh;
}

.gradio-container {
    max-width: 1050px !important;
    margin: 0 auto !important;
}

/* App Header */
.app-header {
    text-align: center;
    padding: 3.5rem 1rem 1.5rem;
    position: relative;
    z-index: 10;
}
.app-header h1 {
    font-size: 3.2rem;
    font-weight: 800;
    letter-spacing: -1.5px;
    background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 50%, #f59e0b 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin-bottom: 0.5rem;
    animation: glow-pulse 3s infinite alternate;
}
.app-header p {
    color: #94a3b8;
    font-size: 1.25rem;
    font-weight: 500;
    margin-top: 0;
}
.highlight-badge {
    background: linear-gradient(135deg, #06b6d4, #8b5cf6) !important;
    -webkit-background-clip: border-box !important;
    background-clip: border-box !important;
    -webkit-text-fill-color: white !important;
    color: white !important;
    padding: 4px 10px;
    border-radius: 8px;
    font-size: 0.9rem;
    font-weight: 800;
    vertical-align: top;
    margin-left: 10px;
    box-shadow: 0 0 15px rgba(139, 92, 246, 0.45);
    display: inline-block;
    letter-spacing: 0.5px;
}

/* Glass panel wrapper */
div.tabs-container, .panel-box {
    background: rgba(255, 255, 255, 0.02) !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    border-radius: 20px !important;
    box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.4) !important;
    overflow: visible !important;
}

/* Tabs */
.tab-nav {
    border-bottom: 1px solid rgba(255,255,255,0.05) !important;
    padding: 10px 10px 0 10px !important;
}
.tab-nav button {
    background: transparent !important;
    border: none !important;
    border-bottom: 3px solid transparent !important;
    color: #64748b !important;
    border-radius: 0 !important;
    margin: 0 !important;
    padding: 1rem 2rem !important;
    font-weight: 600 !important;
    font-size: 1.05rem !important;
    transition: all 0.3s ease !important;
    box-shadow: none !important;
}
.tab-nav button.selected, .tab-nav button:hover {
    color: #f8fafc !important;
    border-bottom: 3px solid #06b6d4 !important;
    box-shadow: 0 20px 20px -20px rgba(6,182,212,0.30) !important;
    background: linear-gradient(0deg, rgba(6,182,212,0.10) 0%, transparent 100%) !important;
}

/* Override Gradio layout borders */
div.form {
    border: none !important;
    box-shadow: none !important;
    background: transparent !important;
}

/* Primary Buttons */
button.primary {
    background: linear-gradient(135deg, #8b5cf6 0%, #06b6d4 100%) !important;
    border: none !important;
    color: white !important;
    border-radius: 14px !important;
    font-weight: 700 !important;
    font-size: 1.15rem !important;
    padding: 0.9rem !important;
    letter-spacing: 0.5px !important;
    box-shadow: 0 4px 15px rgba(6,182,212,0.25) !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button.primary:hover {
    transform: translateY(-3px) !important;
    box-shadow: 0 8px 25px rgba(6,182,212,0.40) !important;
}

/* Secondary Button */
button.secondary {
    background: rgba(255,255,255,0.05) !important;
    border: 1px solid rgba(255,255,255,0.1) !important;
    border-radius: 14px !important;
    color: #e2e8f0 !important;
    transition: all 0.2s ease !important;
    font-weight: 600 !important;
}
button.secondary:hover {
    background: rgba(255,255,255,0.15) !important;
    border-color: rgba(255,255,255,0.3) !important;
}

/* Status text box */
.status-text {
    background: rgba(0,0,0,0.4);
    padding: 1.5rem;
    border-radius: 16px;
    border-left: 5px solid #06b6d4;
    color: #e2e8f0;
    font-size: 1rem;
    line-height: 1.6;
}

/* Highlight labels */
label span {
    color: #cbd5e1 !important;
    font-weight: 500 !important;
    letter-spacing: 0.2px !important;
}

/* Clean audio components */
.audio-component {
    border-radius: 16px !important;
    overflow: hidden !important;
    border: 1px solid rgba(255,255,255,0.05) !important;
}

/* Sample tab */
.sample-card {
    background: rgba(255,255,255,0.03);
    border: 1px solid rgba(255,255,255,0.08);
    border-radius: 18px;
    padding: 1.5rem;
    box-shadow: 0 10px 30px rgba(0,0,0,0.25);
}
.sample-card h3 {
    margin-top: 0;
    color: #f8fafc;
    font-size: 1.2rem;
    font-weight: 700;
}
.sample-card p {
    color: #94a3b8;
    margin-bottom: 1rem;
}
.sample-audio-wrap {
    background: rgba(0,0,0,0.35);
    border: 1px solid rgba(255,255,255,0.06);
    border-radius: 16px;
    padding: 1rem;
}
.sample-audio-wrap audio {
    width: 100%;
    outline: none;
    border-radius: 12px;
}

/* Global Animations */
@keyframes glow-pulse {
    0% { filter: drop-shadow(0 0 15px rgba(139, 92, 246, 0.25)); }
    100% { filter: drop-shadow(0 0 30px rgba(6, 182, 212, 0.45)); }
}

/* Footer Hide */
footer { display: none !important; }
"""

# Voice choices fetched once when the module is loaded; they seed the
# Text-to-Speech voice dropdown (its first entry's value is the default).
INITIAL_VOICES = get_voice_choices()

# Build the Gradio UI: a header banner, four tabs (speech-to-text,
# text-to-speech, voice cloning, sample player) and a footer. Components are
# laid out in the order they are created inside the nested `with` contexts.
with gr.Blocks(title="Eburon Voice Studio", css=css) as demo:

    # Static header banner (styled by the .app-header rules in `css`).
    gr.HTML("""
    <div class="app-header">
        <h1>🎙️ Eburon Voice Studio <span class="highlight-badge">VOICE LAB</span></h1>
        <p>Powered by Eburon Audio · Speech-to-Text, Text-to-Speech, and Instant Voice Cloning</p>
        <div style="margin-top: 15px;">
            <a href="https://eburon.ai" target="_blank" style="text-decoration: none;">
                <span style="background: linear-gradient(135deg, #8b5cf6, #06b6d4); color: white; padding: 6px 14px; border-radius: 20px; font-weight: bold; font-size: 0.9rem; box-shadow: 0 4px 15px rgba(6, 182, 212, 0.35); display: inline-block; cursor: pointer; transition: transform 0.2s;">
                    ✨ Visit Eburon
                </span>
            </a>
        </div>
    </div>
    """)

    with gr.Tabs():

        # ── TAB 1: Speech to Text ──────────────────────────────────────────
        with gr.TabItem("🎤 Speech → Text", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Upload or record audio** and Eburon Audio will transcribe it with high accuracy.
            Supports multiple languages, handles noisy inputs, and can detect the language automatically.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    # type="filepath": the handler receives a path to the audio file.
                    stt_audio = gr.Audio(
                        label="Audio Input",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    stt_language = gr.Dropdown(
                        choices=LANGUAGES,
                        value="Auto-detect",
                        label="Language",
                    )
                    stt_btn = gr.Button("✨ Transcribe", variant="primary")

                with gr.Column(scale=1):
                    stt_output = gr.Textbox(
                        label="Transcription",
                        lines=12,
                        placeholder="Your transcribed text will appear here...",
                    )

            # The handler's return string (transcript, warning or error) fills the textbox.
            stt_btn.click(
                fn=transcribe_handler,
                inputs=[stt_audio, stt_language],
                outputs=stt_output,
            )

        # ── TAB 2: Text to Speech ──────────────────────────────────────────
        with gr.TabItem("🔊 Text → Speech", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Type text** and Eburon Audio converts it into natural speech.
            Optionally paste a **Voice ID** from the Voice Cloning tab to use your own cloned voice.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    tts_text = gr.Textbox(
                        label="Text to speak",
                        lines=8,
                        placeholder="Enter text here (max ~300 words for best results). Avoid markdown or special characters.",
                        value="And that is what makes this moment so important.\n\nBecause we are no longer in the phase where AI is only a spectacle.\nWe are entering the phase where it must become dependable.\nIntegrated.\nEfficient.\nAnd truly beneficial.\n\nThe projects that matter now will be the ones that combine vision with grounded execution.\nThe ones that understand cost as well as capability.\nLatency as well as intelligence.\nHuman need as well as model performance.",
                    )
                    with gr.Row():
                        # allow_custom_value=True lets the user type/paste a Voice ID
                        # that is not among the listed choices.
                        tts_voice_id = gr.Dropdown(
                            label="Select a Voice or Your Session Clones",
                            choices=INITIAL_VOICES,
                            value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
                            allow_custom_value=True,
                            scale=3,
                        )
                        voices_btn = gr.Button("🔄 Refresh List", size="sm", scale=1)

                    # NOTE(review): hidden component that no event handler in the
                    # visible code reads or writes — possibly a leftover; confirm.
                    voices_list_out = gr.Markdown(visible=False)

                    tts_ref_audio = gr.Audio(
                        label="OR: Reference Audio (Set voice tone instantly)",
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                    tts_format = gr.Dropdown(
                        choices=["mp3", "wav", "flac", "opus"],
                        value="mp3",
                        label="Audio Format",
                    )
                    tts_btn = gr.Button("🎵 Generate Speech", variant="primary")

                with gr.Column(scale=1):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    tts_status = gr.Markdown(elem_classes=["status-text"])

            # synthesize_handler returns (audio path or None, status message).
            tts_btn.click(
                fn=synthesize_handler,
                inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                outputs=[tts_audio_out, tts_status],
            )
            # Reload the dropdown choices from get_voice_choices().
            # NOTE(review): this overwrites the choices set by clone_handler, so the
            # "(Custom Session Clone)" entry it added is no longer listed after a
            # refresh — confirm this is intended.
            voices_btn.click(
                fn=lambda: gr.update(choices=get_voice_choices()),
                inputs=[],
                outputs=tts_voice_id,
            )

        # ── TAB 3: Voice Cloning ───────────────────────────────────────────
        with gr.TabItem("🧬 Voice Cloning", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Clone any voice** by uploading a short audio sample (10–60 seconds recommended).
            The system will save it as a reusable voice. Copy the Voice ID and paste it in the Text-to-Speech tab.

            > ⚠️ Only clone voices with explicit consent.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    clone_audio = gr.Audio(
                        label="Voice Sample (upload or record)",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    clone_url = gr.Textbox(
                        label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
                        placeholder="https://...link_to_audio_or_video...",
                    )
                    clone_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g. eburon-assistant-voice",
                    )
                    clone_gender = gr.Dropdown(
                        choices=["Female", "Male"],
                        value="Female",
                        label="Gender",
                    )
                    clone_langs = gr.Textbox(
                        label="Languages (comma-separated)",
                        value="en",
                        placeholder="en, fr, es",
                    )
                    clone_btn = gr.Button("🧬 Clone Voice", variant="primary")

                with gr.Column(scale=1):
                    clone_result = gr.Markdown(
                        value="Your new Voice ID will appear here after cloning.",
                        elem_classes=["status-text"],
                    )

            # The second output targets the TTS dropdown in the other tab, so a
            # successful clone is added to its choices and selected there.
            clone_btn.click(
                fn=clone_handler,
                inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                outputs=[clone_result, tts_voice_id],
            )

        # ── TAB 4: Sample ──────────────────────────────────────────────────
        with gr.TabItem("🎧 Sample", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Preview the sample audio** below.
            This is rendered as a native playable audio sample without affecting the existing app flow.
            """)
            # Plain HTML <audio> element pointing at SAMPLE_AUDIO_URL; no Gradio
            # audio component or event handler is involved.
            gr.HTML(f"""
            <div class="sample-card">
                <h3>Playable Audio Sample</h3>
                <p>Loaded from: <a href="{SAMPLE_AUDIO_URL}" target="_blank" style="color:#06b6d4;">{SAMPLE_AUDIO_URL}</a></p>
                <div class="sample-audio-wrap">
                    <audio controls preload="metadata">
                        <source src="{SAMPLE_AUDIO_URL}" type="audio/mpeg">
                        Your browser does not support the audio element.
                    </audio>
                </div>
            </div>
            """)

    # Custom footer (Gradio's own footer is hidden by the stylesheet).
    gr.HTML("""
    <div style="text-align:center; padding: 1.5rem; color: #475569; font-size: 0.85rem;">
        Built for <a href="https://eburon.ai" target="_blank" style="color:#06b6d4;">Eburon</a>
        · Powered by your existing audio backend
        · <a href="https://echo.eburon.ai" target="_blank" style="color:#8b5cf6;">Echo Space</a>
    </div>
    """)


if __name__ == "__main__":
    # Script entry point: serve the app on all network interfaces at port
    # 7860 with Gradio's server-side rendering turned off.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
    )