"""Gradio front-end for the Mimic 3 text-to-speech command-line tool.

The app shells out to the ``mimic3`` CLI, captures the WAV audio it writes to
stdout and, when ``ffmpeg`` is available, converts the result to MP3.
"""

import html
import os
import shutil
import subprocess
import tempfile
from typing import Optional

import gradio as gr

DESCRIPTION = """
Mimic 3 TTS on Hugging Face Spaces (Gradio)

- Uses the Mimic 3 CLI under-the-hood and returns MP3 audio (falls back to WAV if conversion fails).
- Leave the Voice Key blank to use the default voice, or provide a specific key (e.g., `en_US/cmu-arctic_low`).
- You can optionally wrap the input in SSML for rate/pitch by toggling the advanced options.

Note: The first run may download voice models and can take longer.
"""

# Human-readable names for the base language part of a voice language code.
_LANGUAGE_NAMES = {
    "en": "English",
    "ko": "Korean",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
    "nl": "Dutch",
    "sv": "Swedish",
    "no": "Norwegian",
    "da": "Danish",
    "fi": "Finnish",
    "pl": "Polish",
    "ru": "Russian",
    "tr": "Turkish",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
    "zh": "Chinese",
}


def build_text(text: str, use_ssml: bool, rate: Optional[str], pitch: Optional[str]) -> str:
    """Return the text to hand to Mimic 3, wrapped in SSML prosody if requested.

    Args:
        text: Raw user text; ``None`` is treated as an empty string.
        use_ssml: Whether the SSML prosody options are enabled.
        rate: Optional SSML rate value such as ``"85%"`` or ``"-10%"``.
        pitch: Optional SSML pitch value such as ``"+2st"``.

    Returns:
        ``text`` unchanged when SSML is disabled or neither ``rate`` nor
        ``pitch`` carries a value; otherwise a ``<speak><prosody ...>``
        document with the text XML-escaped.
    """
    text = text or ""
    # Whitespace-only values count as "not provided" so that no empty
    # rate=""/pitch="" attribute is ever emitted.
    rate = (rate or "").strip()
    pitch = (pitch or "").strip()
    if not use_ssml or not (rate or pitch):
        return text

    attrs = ""
    if rate:
        attrs += f' rate="{html.escape(rate, quote=True)}"'
    if pitch:
        attrs += f' pitch="{html.escape(pitch, quote=True)}"'
    # The text is escaped so characters such as "&" or "<" cannot break the XML.
    return f"<speak><prosody{attrs}>{html.escape(text)}</prosody></speak>"


def _run_mimic3(cmd: list) -> bytes:
    """Run the Mimic 3 CLI and return the bytes it wrote to stdout.

    Raises:
        gr.Error: If the executable is missing, exits with a non-zero status,
            or produces no output.
    """
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError:
        # Only the mimic3 launch is guarded here, so this message cannot be
        # triggered by an unrelated missing file later on.
        raise gr.Error(
            "mimic3 CLI not found. Ensure package 'mycroft-mimic3-tts' is installed and available in PATH."
        ) from None

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Mimic 3 failed (code {proc.returncode}).\n\n{err}")
    if not proc.stdout:
        raise gr.Error("Mimic 3 produced no audio output.")
    return proc.stdout


def _convert_to_mp3(wav_path: str) -> str:
    """Convert ``wav_path`` to MP3 with ffmpeg, falling back to the WAV file.

    Returns:
        The path of the MP3 file (the WAV is deleted) when conversion
        succeeds, otherwise ``wav_path`` unchanged.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if not ffmpeg_path:
        return wav_path

    mp3_fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
    os.close(mp3_fd)
    try:
        convert = subprocess.run(
            [ffmpeg_path, "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        # mkstemp already created the file, so its existence proves nothing;
        # a non-empty file is the real sign that ffmpeg wrote audio.
        converted = convert.returncode == 0 and os.path.getsize(mp3_path) > 0
    except OSError:
        converted = False

    if converted:
        os.remove(wav_path)
        return mp3_path

    # Conversion failed; clean up the mp3 placeholder and return WAV instead.
    if os.path.exists(mp3_path):
        os.remove(mp3_path)
    return wav_path


def synthesize(text: str, voice_key: str, use_ssml: bool, rate: str, pitch: str):
    """Synthesize ``text`` with Mimic 3 and return the path of an audio file.

    Args:
        text: Text to speak.
        voice_key: Mimic 3 voice key; blank selects the CLI's default voice.
        use_ssml: Whether the SSML prosody options are enabled.
        rate: Optional SSML rate value.
        pitch: Optional SSML pitch value.

    Returns:
        ``None`` for blank input, otherwise the path of a temporary MP3 file,
        or of a WAV file when ffmpeg is unavailable or conversion fails.

    Raises:
        gr.Error: If Mimic 3 is missing or fails, or on any other error.
    """
    if not text or not text.strip():
        return None

    input_text = build_text(text.strip(), use_ssml, rate, pitch)
    # Ask for SSML parsing only when the payload really is an SSML document
    # (wrapped by build_text or typed by the user); plain text would not parse.
    is_ssml = bool(use_ssml) and input_text.lstrip().startswith("<speak")

    cmd = ["mimic3"]
    if voice_key and voice_key.strip():
        cmd += ["--voice", voice_key.strip()]
    if is_ssml:
        cmd.append("--ssml")
    # "--" ends option parsing so text starting with "-" is not read as a flag.
    cmd += ["--", input_text]

    try:
        wav_bytes = _run_mimic3(cmd)
        # Write the WAV bytes to a temp file that outlives this call, because
        # the audio component is handed a file path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(wav_bytes)
            wav_path = tmp.name
        return _convert_to_mp3(wav_path)
    except gr.Error:
        # Already a user-facing error; re-wrapping would mangle its message.
        raise
    except Exception as exc:
        raise gr.Error(str(exc)) from exc


def _parse_voices(output: str) -> dict:
    """Group the voice keys listed in ``output`` by language.

    The first whitespace-separated token of every non-blank line is taken as
    the voice key; the part before the first ``/`` is its language. Keys
    without a ``/`` are collected under ``"other"``.

    Returns:
        A dict mapping language code to a sorted list of voice keys.
    """
    mapping = {}
    for line in output.splitlines():
        line = line.strip()
        if not line:
            continue
        key = line.split()[0]
        # Presumably the column header row of `mimic3 --voices`; it is not a
        # voice. TODO confirm the exact header text against the CLI.
        if key == "KEY":
            continue
        lang = key.split("/", 1)[0] if "/" in key else "other"
        mapping.setdefault(lang, []).append(key)

    for voices in mapping.values():
        voices.sort()
    return mapping


def _lang_label(code: str) -> str:
    """Return a display label such as ``"English (US)"`` for ``"en_US"``.

    Codes whose base language is not in ``_LANGUAGE_NAMES`` are returned
    unchanged, which avoids labels like ``"af_ZA (ZA)"``.
    """
    base, _, region = code.partition("_")
    base_name = _LANGUAGE_NAMES.get(base.lower())
    if base_name is None:
        return code
    return f"{base_name} ({region})" if region else base_name


def load_voices():
    """Query ``mimic3 --voices`` and build updates for the voice dropdowns.

    Returns:
        A tuple of (language dropdown update, voice dropdown update,
        language -> voice keys mapping).

    Raises:
        gr.Error: If the CLI is missing, fails, or lists no voices.
    """
    try:
        proc = subprocess.run(
            ["mimic3", "--voices"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError:
        raise gr.Error("mimic3 CLI not found. Ensure 'mycroft-mimic3-tts' is installed.") from None

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Failed to list voices.\n\n{err}")

    mapping = _parse_voices(proc.stdout.decode(errors="ignore"))
    if not mapping:
        raise gr.Error("No voices found. Try again after models are available.")

    languages = sorted(mapping)
    default_lang = languages[0]
    voices = mapping[default_lang]

    # Human-readable labels are shown while the language codes stay the values.
    language_choices = [(_lang_label(code), code) for code in languages]

    return (
        gr.update(choices=language_choices, value=default_lang),
        gr.update(choices=voices, value=voices[0] if voices else None),
        mapping,
    )


def _voices_for(lang: str, mapping: dict) -> list:
    """Return the voice keys stored for ``lang``, or ``[]`` if unavailable."""
    return mapping.get(lang, []) if isinstance(mapping, dict) else []


def on_language_change(lang: str, mapping: dict):
    """Return a voice dropdown update listing the voices of ``lang``."""
    voices = _voices_for(lang, mapping)
    return gr.update(choices=voices, value=(voices[0] if voices else None))


def filter_voices(search: str, lang: str, mapping: dict):
    """Return a voice dropdown update limited to keys containing ``search``.

    Matching is case-insensitive; an empty search keeps every voice of
    ``lang``.
    """
    voices = _voices_for(lang, mapping)
    if search:
        needle = search.strip().lower()
        voices = [v for v in voices if needle in v.lower()]
    return gr.update(choices=voices, value=(voices[0] if voices else None))


def synthesize_with_custom(
    t: str,
    selected_voice: str,
    custom: str,
    use_ssml_val: bool,
    rate_val: str,
    pitch_val: str,
):
    """Synthesize ``t``, preferring a non-blank custom voice key over the dropdown."""
    voice = custom.strip() if (custom and custom.strip()) else selected_voice
    return synthesize(t, voice, use_ssml_val, rate_val, pitch_val)


with gr.Blocks(title="Mimic 3 TTS") as demo:
    gr.Markdown(f"# Mimic 3 TTS\n{DESCRIPTION}")

    with gr.Row():
        text = gr.Textbox(label="Text", placeholder="Type text to synthesize…", lines=4)

    with gr.Row():
        language_dd = gr.Dropdown(label="Language", choices=[], interactive=True)
        voice_dd = gr.Dropdown(label="Voice", choices=[], interactive=True)

    with gr.Row():
        voice_search = gr.Textbox(
            label="Voice Search (filters by language)",
            placeholder="Type to filter voices, e.g., 'ko' or 'female' if present in key",
        )
        refresh_btn = gr.Button("Refresh Voices")

    with gr.Row():
        custom_voice = gr.Textbox(
            label="Custom Voice Key (optional)",
            placeholder="Overrides Voice dropdown if provided",
        )

    # Holds the language -> voice keys mapping produced by load_voices.
    voices_state = gr.State({})

    with gr.Accordion("Advanced (SSML)", open=False):
        use_ssml = gr.Checkbox(label="Use SSML prosody for rate/pitch", value=False)
        with gr.Row():
            rate = gr.Textbox(label="Rate (e.g., 85%, 110%)", placeholder="Optional")
            pitch = gr.Textbox(label="Pitch (e.g., +2st, -2st)", placeholder="Optional")

    with gr.Row():
        btn = gr.Button("Synthesize", variant="primary")

    audio = gr.Audio(label="Output Audio", type="filepath")

    # Load voices at app start
    demo.load(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )

    # Change voices when language changes
    language_dd.change(
        fn=on_language_change,
        inputs=[language_dd, voices_state],
        outputs=[voice_dd],
    )

    # Filter voices as user types
    voice_search.change(
        fn=filter_voices,
        inputs=[voice_search, language_dd, voices_state],
        outputs=[voice_dd],
    )

    # Refresh voices list from CLI
    refresh_btn.click(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )

    btn.click(
        fn=synthesize_with_custom,
        inputs=[text, voice_dd, custom_voice, use_ssml, rate, pitch],
        outputs=[audio],
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))