Spaces:
Sleeping
Sleeping
| import os | |
| import base64 | |
| import tempfile | |
| import gradio as gr | |
| from pathlib import Path | |
| import requests | |
| from mistralai.client import Mistral | |
def list_user_voices():
    """Return a Markdown summary of the voices available to the account.

    Kept as a textual debugging aid alongside the dropdown. Only the first
    page of results (up to 100 voices) is listed, while the reported total
    comes from ``result.total``.

    Returns:
        str: A Markdown bullet list of voices, a "no voices" notice, or an
        ``"Error fetching voices: ..."`` message if anything fails.
    """
    try:
        client = get_client()
        result = client.audio.voices.list(limit=100, offset=0)
        if result.total == 0:
            return "No voices found in your account."

        parts = [f"**Total Voices:** {result.total}\n\n"]
        for voice in result.items:
            # Guard against a missing, None or empty `languages` attribute so a
            # single odd record cannot make the whole listing fail.
            languages = getattr(voice, "languages", None)
            language_text = ", ".join(languages) if languages else "unknown"
            parts.append(
                f"- **{voice.name}**\n - ID: `{voice.id}`\n - Languages: {language_text}\n"
            )
        return "".join(parts)
    except Exception as e:
        # Shown as text in the UI, so report the failure instead of raising.
        return f"Error fetching voices: {e}"
def get_voice_choices():
    """Return ``(label, voice_id)`` dropdown choices for official Mistral voices.

    Only voices whose name starts with one of the official speaker names
    (Paul, Oliver, Jane, Marie) and contains ``" - "`` are kept, which hides
    arbitrary cloned user voices from the shared dropdown.

    Returns:
        list[tuple[str, str]]: Matching ``(name, id)`` pairs, or an empty list
        when the client cannot be created or the request fails.
    """
    official_names = ("Paul", "Oliver", "Jane", "Marie")
    try:
        client = get_client()
        res = client.audio.voices.list(limit=100, offset=0)
        return [
            (v.name, v.id)
            for v in res.items
            if v.name.startswith(official_names) and " - " in v.name
        ]
    except Exception:
        # Deliberately best-effort: this runs at import time to seed the UI, so
        # a missing key or network error must degrade to "no choices".
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return []
# ─── Client ──────────────────────────────────────────────────────────────────
def get_client():
    """Build a Mistral client from the ``MISTRAL_API_KEY`` environment variable.

    Returns:
        Mistral: A client configured with the API key.

    Raises:
        gr.Error: If the variable is unset or blank.
    """
    # Strip stray whitespace/newlines that often sneak in when a secret is
    # pasted; a blank value is treated the same as a missing one.
    api_key = (os.environ.get("MISTRAL_API_KEY") or "").strip()
    if not api_key:
        raise gr.Error("MISTRAL_API_KEY secret is not set. Add it in Space Settings → Secrets.")
    return Mistral(api_key=api_key)
# ─── STT ─────────────────────────────────────────────────────────────────────
def transcribe_audio(audio_path, language):
    """Convert audio file → text using Voxtral Mini Transcribe.

    Args:
        audio_path: Filesystem path of the recorded/uploaded audio, or a falsy
            value when nothing was provided.
        language: Language code to force, or ``"Auto-detect"`` to omit the
            ``language`` parameter from the request.

    Returns:
        str: The transcription text, a warning when no audio was supplied, or
        an error message if the request fails.
    """
    # Treat "" like None: there is nothing to open either way.
    if not audio_path:
        return "⚠️ Please record or upload an audio file first."
    try:
        client = get_client()
        with open(audio_path, "rb") as f:
            kwargs = dict(
                model="voxtral-mini-latest",
                file={"content": f, "file_name": Path(audio_path).name},
            )
            # Only send `language` when the user picked a concrete code.
            if language and language != "Auto-detect":
                kwargs["language"] = language
            # The request must be made while the file handle is still open.
            response = client.audio.transcriptions.complete(**kwargs)
        return response.text
    except Exception as e:
        # Surface any failure (missing key, I/O, API error) as output text.
        return f"❌ Error: {e}"
# ─── TTS ─────────────────────────────────────────────────────────────────────
# Label → voice ID table; its single entry maps the "no clone" label to None.
BUILTIN_VOICES = {"Default (no voice clone)": None}
def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
    """Convert text → speech using Voxtral Mini TTS.

    Args:
        text: Text to speak.
        voice_id_input: Voice ID selected or typed in the dropdown (may be
            empty or None).
        ref_audio_path: Optional path of a reference clip; it is sent
            base64-encoded as ``ref_audio``.
        audio_format: Output format, used both as ``response_format`` and as
            the temp file suffix.

    Returns:
        tuple: ``(audio_file_path, status_message)``; the path is ``None`` when
        validation or the request fails.
    """
    if not text or not text.strip():
        return None, "⚠️ Please enter some text."

    voice_id = (voice_id_input or "").strip() or None
    # Validate up front instead of raising into our own `except` after the
    # reference clip has already been read.
    if not voice_id and not ref_audio_path:
        return None, (
            "❌ Error: Mistral API requires a voice! Please either upload a short 'Reference Audio' clip to define the voice tone zero-shot, OR paste a valid Voice ID you cloned in the Voice Cloning tab. There are no built-in standard voices."
        )

    try:
        client = get_client()
        kwargs = dict(
            model="voxtral-mini-tts-2603",
            input=text,
            response_format=audio_format,
        )
        if voice_id:
            kwargs["voice_id"] = voice_id
        # Reference audio for zero-shot tone/voice cloning.
        if ref_audio_path:
            kwargs["ref_audio"] = base64.b64encode(
                Path(ref_audio_path).read_bytes()
            ).decode("utf-8")

        response = client.audio.speech.complete(**kwargs)
        audio_bytes = base64.b64decode(response.audio_data)

        # delete=False: the file must outlive this call so the audio widget can
        # serve it. The `with` still guarantees the handle is closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{audio_format}") as tmp:
            tmp.write(audio_bytes)
        return tmp.name, f"✅ Generated {len(audio_bytes):,} bytes of {audio_format.upper()} audio."
    except Exception as e:
        return None, f"❌ Error: {e}"
# ─── Voice Cloning ───────────────────────────────────────────────────────────
# URL suffixes treated as direct audio links (downloaded as-is, bypassing yt-dlp).
_DIRECT_AUDIO_EXTENSIONS = ('.mp3', '.wav', '.flac', '.ogg', '.m4a')


def _download_direct_audio(url, base_out):
    """Stream a direct audio link to ``base_out`` plus its extension; return the path."""
    ext = url.split('.')[-1]
    target = f"{base_out}.{ext}"
    with requests.get(url, stream=True, timeout=15) as r:
        r.raise_for_status()
        with open(target, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return target


def _download_with_ytdlp(url, base_out):
    """Extract a hosted video's audio as MP3 with yt-dlp; return the MP3 path.

    Raises:
        RuntimeError: With a user-facing message when the download fails.
    """
    import yt_dlp  # imported lazily so the app loads even if yt-dlp is absent

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': base_out + '.%(ext)s',
        'quiet': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '128',
        }],
        'postprocessor_args': [
            '-t', '60'  # Limit to first 60 seconds
        ],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url, download=True)
    except Exception as e:
        err_msg = str(e)
        if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
            raise RuntimeError("YouTube blocked the Hugging Face Server. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.") from e
        raise RuntimeError(f"Video download failed: {err_msg}") from e
    # The FFmpegExtractAudio post-processor is configured to produce MP3.
    return base_out + '.mp3'


def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
    """Upload a sample audio or provide a URL to create a reusable cloned voice.

    When a URL is given it takes precedence over ``audio_path``: direct audio
    links are downloaded as-is, anything else goes through yt-dlp.

    Args:
        audio_path: Path of an uploaded/recorded sample, or a falsy value.
        url_input: Optional media URL.
        voice_name: Name for the new voice (required).
        gender: Gender label; sent lower-cased.
        languages_str: Comma-separated language codes; defaults to ``["en"]``
            when empty.

    Returns:
        tuple: ``(status_markdown, dropdown_update)``. On success the update
        carries the official voices plus the new clone and selects the clone;
        otherwise it is a no-op ``gr.update()``.
    """
    # Normalise once so None from the UI cannot crash the validation below.
    url = (url_input or "").strip()
    name = (voice_name or "").strip()

    if not audio_path and not url:
        return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
    if not name:
        return "⚠️ Please enter a name for the voice.", gr.update()

    try:
        # A private temp directory replaces the race-prone tempfile.mktemp()
        # and guarantees downloaded media is removed on every exit path.
        with tempfile.TemporaryDirectory() as work_dir:
            final_audio_path = audio_path
            if url:
                # NOTE(security): `url` is user-supplied and fetched server-side;
                # consider restricting hosts/schemes if this runs near private networks.
                base_out = os.path.join(work_dir, "sample")
                if url.lower().endswith(_DIRECT_AUDIO_EXTENSIONS):
                    try:
                        final_audio_path = _download_direct_audio(url, base_out)
                    except Exception as e:
                        return f"❌ Error downloading direct audio link: {e}", gr.update()
                else:
                    # TikTok, Twitter, YouTube (if not blocked), etc.
                    final_audio_path = _download_with_ytdlp(url, base_out)

            client = get_client()
            sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
            langs = [part.strip() for part in (languages_str or "").split(",") if part.strip()] or ["en"]
            voice = client.audio.voices.create(
                name=name,
                sample_audio=sample_b64,
                sample_filename=Path(final_audio_path).name,
                languages=langs,
                gender=gender.lower(),
            )

        # Choices for this user session: official voices + the new clone.
        new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
        # Fall back to the requested languages if the response omits them, so a
        # successful clone is never reported as an error.
        shown_languages = ', '.join(voice.languages or langs)
        return (
            f"✅ Voice created!\n\n**Voice ID:** `{voice.id}`\n**Name:** {voice.name}\n**Languages:** {shown_languages}\n\nThis voice has been automatically selected in the Text-to-Speech tab!",
            gr.update(choices=new_session_choices, value=voice.id)
        )
    except Exception as e:
        return f"❌ Error: {e}", gr.update()
# ─── UI ──────────────────────────────────────────────────────────────────────
# Choices for the transcription language dropdown: the auto-detect sentinel
# followed by the selectable language codes.
LANGUAGES = ["Auto-detect"] + "en fr es de it pt zh ja ko ar ru hi nl".split()
# Custom stylesheet handed to gr.Blocks(css=css) below. The .app-header,
# .highlight-badge, .status-text, .audio-component and .tabs-container classes
# are the ones set from this file via HTML markup / elem_classes; the other
# selectors (.tab-nav, button.primary, div.form, footer, ...) presumably target
# Gradio's own generated markup — verify against the installed Gradio version.
css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&display=swap');
* { font-family: 'Outfit', sans-serif; }
body, .gradio-container {
background: radial-gradient(circle at 10% 20%, #150f24 0%, #07040d 100%) !important;
min-height: 100vh;
}
.gradio-container {
max-width: 1050px !important;
margin: 0 auto !important;
}
/* App Header */
.app-header {
text-align: center;
padding: 3.5rem 1rem 1.5rem;
position: relative;
z-index: 10;
}
.app-header h1 {
font-size: 3.2rem;
font-weight: 800;
letter-spacing: -1.5px;
background: linear-gradient(135deg, #c084fc 0%, #ec4899 50%, #facc15 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
margin-bottom: 0.5rem;
animation: glow-pulse 3s infinite alternate;
}
.app-header p {
color: #94a3b8;
font-size: 1.25rem;
font-weight: 500;
margin-top: 0;
}
.highlight-badge {
background: linear-gradient(135deg, #f59e0b, #ef4444);
color: white;
padding: 2px 8px;
border-radius: 8px;
font-size: 0.8rem;
font-weight: 800;
vertical-align: top;
margin-left: 10px;
box-shadow: 0 0 10px rgba(239, 68, 68, 0.6);
}
/* Glass panel wrapper */
div.tabs-container, .panel-box {
background: rgba(255, 255, 255, 0.02) !important;
border: 1px solid rgba(255, 255, 255, 0.05) !important;
border-radius: 20px !important;
box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.4) !important;
backdrop-filter: blur(15px) !important;
-webkit-backdrop-filter: blur(15px) !important;
overflow: hidden;
}
/* Tabs */
.tab-nav {
border-bottom: 1px solid rgba(255,255,255,0.05) !important;
padding: 10px 10px 0 10px !important;
}
.tab-nav button {
background: transparent !important;
border: none !important;
border-bottom: 3px solid transparent !important;
color: #64748b !important;
border-radius: 0 !important;
margin: 0 !important;
padding: 1rem 2rem !important;
font-weight: 600 !important;
font-size: 1.05rem !important;
transition: all 0.3s ease !important;
box-shadow: none !important;
}
.tab-nav button.selected, .tab-nav button:hover {
color: #f8fafc !important;
border-bottom: 3px solid #ec4899 !important;
box-shadow: 0 20px 20px -20px rgba(236,72,153,0.3) !important;
background: linear-gradient(0deg, rgba(236,72,153,0.1) 0%, transparent 100%) !important;
}
/* Inputs & Textareas */
textarea, input[type="text"], .dropdown-menu {
background: rgba(0,0,0,0.25) !important;
border: 1px solid rgba(255,255,255,0.08) !important;
border-radius: 14px !important;
color: #f8fafc !important;
font-size: 1.05rem !important;
transition: all 0.2s ease !important;
padding: 0.75rem !important;
}
textarea:focus, input[type="text"]:focus {
border-color: #ec4899 !important;
box-shadow: 0 0 0 3px rgba(236,72,153,0.2) !important;
background: rgba(0,0,0,0.4) !important;
}
/* Override Gradio layout borders */
div.form {
border: none !important;
box-shadow: none !important;
background: transparent !important;
}
/* Cool gradient buttons */
button.primary {
background: linear-gradient(135deg, #a78bfa 0%, #ec4899 100%) !important;
border: none !important;
color: white !important;
border-radius: 14px !important;
font-weight: 700 !important;
font-size: 1.15rem !important;
padding: 0.9rem !important;
letter-spacing: 0.5px !important;
box-shadow: 0 4px 15px rgba(236,72,153,0.3) !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button.primary:hover {
transform: translateY(-3px) !important;
box-shadow: 0 8px 25px rgba(236,72,153,0.5) !important;
}
/* Secondary Button */
button.secondary {
background: rgba(255,255,255,0.05) !important;
border: 1px solid rgba(255,255,255,0.1) !important;
border-radius: 14px !important;
color: #e2e8f0 !important;
transition: all 0.2s ease !important;
font-weight: 600 !important;
}
button.secondary:hover {
background: rgba(255,255,255,0.15) !important;
border-color: rgba(255,255,255,0.3) !important;
}
/* Status text box */
.status-text {
background: rgba(0,0,0,0.4);
padding: 1.5rem;
border-radius: 16px;
border-left: 5px solid #a78bfa;
color: #e2e8f0;
font-size: 1rem;
line-height: 1.6;
}
/* Highlight labels */
label span {
color: #cbd5e1 !important;
font-weight: 500 !important;
letter-spacing: 0.2px !important;
}
/* Clean audio components */
.audio-component {
border-radius: 16px !important;
overflow: hidden !important;
border: 1px solid rgba(255,255,255,0.05) !important;
}
/* Global Animations */
@keyframes glow-pulse {
0% { filter: drop-shadow(0 0 15px rgba(167,139,250,0.3)); }
100% { filter: drop-shadow(0 0 30px rgba(236,72,153,0.6)); }
}
/* Footer Hide */
footer { display: none !important; }
"""
# Official voice choices fetched once at import time to seed the TTS dropdown.
# get_voice_choices() returns [] on any failure, so start-up never raises here.
INITIAL_VOICES = get_voice_choices()

with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:
    gr.HTML("""
    <div class="app-header">
    <h1>🎙️ Voxtral Studio <span class="highlight-badge">VOICE CLONING</span></h1>
    <p>Powered by Mistral AI · STT & Elite Text-to-Speech + Instant Zero-Shot Cloning</p>
    </div>
    """)

    with gr.Tabs():
        # ── TAB 1: Speech to Text ────────────────────────────────────────────
        with gr.TabItem("🎤 Speech → Text"):
            gr.Markdown("""
            **Upload or record audio** and Voxtral Mini will transcribe it with high accuracy.
            Supports 13 languages, handles noise, and can detect the language automatically.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    stt_audio = gr.Audio(
                        label="Audio Input",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    stt_language = gr.Dropdown(
                        choices=LANGUAGES,
                        value="Auto-detect",
                        label="Language",
                    )
                    stt_btn = gr.Button("✨ Transcribe", variant="primary")
                with gr.Column(scale=1):
                    stt_output = gr.Textbox(
                        label="Transcription",
                        lines=12,
                        placeholder="Your transcribed text will appear here...",
                    )
            stt_btn.click(
                fn=transcribe_audio,
                inputs=[stt_audio, stt_language],
                outputs=stt_output,
            )

        # ── TAB 2: Text to Speech ────────────────────────────────────────────
        with gr.TabItem("🔊 Text → Speech", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Type text** and Voxtral Mini TTS converts it to natural speech.
            Optionally paste a **Voice ID** from the Voice Cloning tab to use your own cloned voice.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    tts_text = gr.Textbox(
                        label="Text to speak",
                        lines=8,
                        placeholder="Enter text here (max ~300 words for best results). Avoid markdown or special characters.",
                        value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
                    )
                    with gr.Row():
                        # allow_custom_value lets users paste a voice ID that is
                        # not among the listed choices.
                        tts_voice_id = gr.Dropdown(
                            label="Select a Mistral Voice or Your Clones",
                            choices=INITIAL_VOICES,
                            value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
                            allow_custom_value=True,
                            scale=3,
                        )
                        voices_btn = gr.Button("🔄 Refresh List", size="sm", scale=1)
                    voices_list_out = gr.Markdown(visible=False)  # Hide text list since we use dropdown now
                    tts_ref_audio = gr.Audio(
                        label="OR: Reference Audio (Set voice tone instantly)",
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                    tts_format = gr.Dropdown(
                        choices=["mp3", "wav", "flac", "opus"],
                        value="mp3",
                        label="Audio Format",
                    )
                    tts_btn = gr.Button("🎵 Generate Speech", variant="primary")
                with gr.Column(scale=1):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    tts_status = gr.Markdown(elem_classes=["status-text"])
            tts_btn.click(
                fn=synthesize_speech,
                inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                outputs=[tts_audio_out, tts_status],
            )
            # Re-query the official voices; only the choice list is replaced.
            voices_btn.click(
                fn=lambda: gr.update(choices=get_voice_choices()),
                inputs=[],
                outputs=tts_voice_id,
            )

        # ── TAB 3: Voice Cloning ─────────────────────────────────────────────
        with gr.TabItem("🧬 Voice Cloning", elem_classes=["tabs-container"]):
            gr.Markdown("""
            **Clone any voice** by uploading a short audio sample (10–60 seconds recommended).
            The model will save it as a reusable voice. Copy the Voice ID and paste it in the TTS tab.
            > ⚠️ Only clone voices with **explicit consent**. Do not impersonate real people.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    clone_audio = gr.Audio(
                        label="Voice Sample (upload or record)",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    clone_url = gr.Textbox(
                        label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
                        placeholder="https://...link_to_audio_or_video...",
                    )
                    clone_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g. my-assistant-voice",
                    )
                    clone_gender = gr.Dropdown(
                        choices=["Female", "Male"],
                        value="Female",
                        label="Gender",
                    )
                    clone_langs = gr.Textbox(
                        label="Languages (comma-separated)",
                        value="en",
                        placeholder="en, fr, es",
                    )
                    clone_btn = gr.Button("🧬 Clone Voice", variant="primary")
                with gr.Column(scale=1):
                    clone_result = gr.Markdown(
                        value="Your new Voice ID will appear here after cloning.",
                        elem_classes=["status-text"],
                    )
            # The second output updates the TTS dropdown so the new clone is
            # selectable (and selected) right after creation.
            clone_btn.click(
                fn=clone_voice,
                inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                outputs=[clone_result, tts_voice_id],
            )

    gr.HTML("""
    <div style="text-align:center; padding: 1.5rem; color: #475569; font-size: 0.85rem;">
    Built with <a href="https://docs.mistral.ai/capabilities/audio/" target="_blank" style="color:#a78bfa;">Mistral Voxtral</a>
    · <a href="https://huggingface.co/" target="_blank" style="color:#60a5fa;">Hugging Face Spaces</a>
    </div>
    """)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)