|
|
import os |
|
|
import html |
|
|
import shutil |
|
|
import subprocess |
|
|
import tempfile |
|
|
from typing import Optional |
|
|
|
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
# Markdown blurb shown under the page heading. It is assembled line by line so
# the exact newlines (including the leading and trailing one) are explicit.
DESCRIPTION = (
    "\n"
    "Mimic 3 TTS on Hugging Face Spaces (Gradio)\n"
    "\n"
    "- Uses the Mimic 3 CLI under-the-hood and returns MP3 audio (falls back to WAV if conversion fails).\n"
    "- Leave the Voice Key blank to use the default voice, or provide a specific key (e.g., `en_US/cmu-arctic_low`).\n"
    "- You can optionally wrap the input in SSML for rate/pitch by toggling the advanced options.\n"
    "\n"
    "Note: The first run may download voice models and can take longer.\n"
)
|
|
|
|
|
|
|
|
def build_text(text: str, use_ssml: bool, rate: Optional[str], pitch: Optional[str]) -> str:
    """Return the text to hand to Mimic 3, optionally wrapped in SSML prosody.

    Args:
        text: Raw user text; ``None`` is treated as an empty string.
        use_ssml: Whether SSML wrapping was requested.
        rate: Optional prosody rate (e.g. ``"85%"``).
        pitch: Optional prosody pitch (e.g. ``"+2st"``).

    Returns:
        ``text`` unchanged when SSML is off or neither rate nor pitch carries
        a value; otherwise a ``<speak><prosody ...>`` document in which both
        the text and the attribute values are escaped.
    """
    text = text or ""

    # Normalise first so whitespace-only values count as "not provided"
    # instead of producing an empty rate=""/pitch="" attribute.
    rate = (rate or "").strip()
    pitch = (pitch or "").strip()
    if not use_ssml or not (rate or pitch):
        return text

    # Attribute values are free-form user input: escape them so a stray quote
    # or angle bracket cannot break out of the attribute.
    attrs = "".join(
        f' {name}="{html.escape(value, quote=True)}"'
        for name, value in (("rate", rate), ("pitch", pitch))
        if value
    )
    return f"<speak><prosody{attrs}>{html.escape(text)}</prosody></speak>"
|
|
|
|
|
|
|
|
def synthesize(text: str, voice_key: str, use_ssml: bool, rate: str, pitch: str):
    """Synthesize ``text`` with the Mimic 3 CLI and return a path to the audio.

    Args:
        text: Text to speak; blank input short-circuits to ``None``.
        voice_key: Mimic 3 voice key; when blank no ``--voice`` flag is passed.
        use_ssml: Pass ``--ssml`` to the CLI and let ``build_text`` add prosody.
        rate: Optional SSML prosody rate.
        pitch: Optional SSML prosody pitch.

    Returns:
        Path to a temporary MP3 file, or to the intermediate WAV file when
        ffmpeg is not on PATH or the conversion fails; ``None`` for blank text.

    Raises:
        gr.Error: If the ``mimic3`` executable is missing, exits with a
            non-zero status, or any other unexpected error occurs.
    """
    if not text or not text.strip():
        return None

    input_text = build_text(text.strip(), use_ssml, rate, pitch)

    cmd = ["mimic3"]
    if voice_key and voice_key.strip():
        cmd += ["--voice", voice_key.strip()]
    if use_ssml:
        cmd += ["--ssml"]
    # "--" ends option parsing, so user text that starts with "-" is spoken
    # instead of being interpreted as a command-line flag.
    cmd += ["--", input_text]

    wav_path = None
    mp3_path = None
    keep = None  # the single temp file that is handed back to the caller
    try:
        try:
            proc = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
        except FileNotFoundError as exc:
            # Only this call can fail because the mimic3 executable is absent.
            raise gr.Error(
                "mimic3 CLI not found. Ensure package 'mycroft-mimic3-tts' is installed and available in PATH."
            ) from exc

        if proc.returncode != 0:
            err = proc.stderr.decode(errors="ignore")
            raise gr.Error(f"Mimic 3 failed (code {proc.returncode}).\n\n{err}")

        # The CLI writes WAV data to stdout; persist it so it can be served.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            wav_path = tmp.name
            tmp.write(proc.stdout)
        keep = wav_path

        ffmpeg_path = shutil.which("ffmpeg")
        if ffmpeg_path:
            mp3_fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
            os.close(mp3_fd)
            convert = subprocess.run(
                [ffmpeg_path, "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3_path],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            if convert.returncode == 0 and os.path.exists(mp3_path):
                keep = mp3_path
        return keep
    except gr.Error:
        # Already user-facing; re-raise untouched rather than wrapping it again.
        raise
    except Exception as e:
        keep = None
        raise gr.Error(str(e)) from e
    finally:
        # Best-effort cleanup of every temp file that is not being returned,
        # so failures and successful conversions do not leave files behind.
        for path in (wav_path, mp3_path):
            if path and path != keep:
                try:
                    os.remove(path)
                except OSError:
                    pass
|
|
|
|
|
|
|
|
def _parse_voices(output: str): |
|
|
|
|
|
mapping = {} |
|
|
for line in output.splitlines(): |
|
|
line = line.strip() |
|
|
if not line: |
|
|
continue |
|
|
|
|
|
key = line.split()[0] |
|
|
if "/" in key: |
|
|
lang = key.split("/", 1)[0] |
|
|
else: |
|
|
|
|
|
lang = "other" |
|
|
mapping.setdefault(lang, []).append(key) |
|
|
|
|
|
for lang in mapping: |
|
|
mapping[lang].sort() |
|
|
return mapping |
|
|
|
|
|
|
|
|
# Display names for the two-letter language prefixes the UI can label.
# Built once at import time instead of on every voice refresh.
_LANGUAGE_NAMES = {
    "en": "English",
    "ko": "Korean",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
    "nl": "Dutch",
    "sv": "Swedish",
    "no": "Norwegian",
    "da": "Danish",
    "fi": "Finnish",
    "pl": "Polish",
    "ru": "Russian",
    "tr": "Turkish",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
    "zh": "Chinese",
}


def _lang_label(code: str) -> str:
    """Return a display label such as ``English (US)`` for ``en_US``.

    Unknown languages fall back to the bare prefix, so ``af_ZA`` becomes
    ``af (ZA)`` rather than repeating the region as ``af_ZA (ZA)``.
    """
    base, _, region = code.partition("_")
    base_name = _LANGUAGE_NAMES.get(base.lower(), base)
    return f"{base_name} ({region})" if region else base_name


def load_voices():
    """List the installed Mimic 3 voices and build the dropdown updates.

    Runs ``mimic3 --voices``, groups the reported keys by language with
    ``_parse_voices`` and selects the alphabetically first language.

    Returns:
        A 3-tuple: update for the language dropdown (``(label, code)``
        choices), update for the voice dropdown (voices of the selected
        language), and the full language -> voice-keys mapping.

    Raises:
        gr.Error: If the ``mimic3`` executable is missing, exits with a
            non-zero status, or reports no voices.
    """
    # Keep the try body to the one call that can raise FileNotFoundError.
    try:
        proc = subprocess.run(
            ["mimic3", "--voices"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError as exc:
        raise gr.Error("mimic3 CLI not found. Ensure 'mycroft-mimic3-tts' is installed.") from exc

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Failed to list voices.\n\n{err}")

    mapping = _parse_voices(proc.stdout.decode(errors="ignore"))
    if not mapping:
        raise gr.Error("No voices found. Try again after models are available.")

    languages = sorted(mapping)
    default_lang = languages[0]
    voices = mapping[default_lang]
    language_choices = [(_lang_label(code), code) for code in languages]

    return (
        gr.update(choices=language_choices, value=default_lang),
        gr.update(choices=voices, value=voices[0] if voices else None),
        mapping,
    )
|
|
|
|
|
|
|
|
def on_language_change(lang: str, mapping: dict):
    """Build the voice-dropdown update for the selected language.

    Args:
        lang: Language code chosen in the language dropdown.
        mapping: Language -> voice-keys mapping; anything that is not a dict
            is treated as having no voices.

    Returns:
        A ``gr.update`` listing that language's voices with the first one
        selected, or no selection when the list is empty.
    """
    if isinstance(mapping, dict):
        available = mapping.get(lang, [])
    else:
        available = []

    selected = available[0] if available else None
    return gr.update(choices=available, value=selected)
|
|
|
|
|
|
|
|
def filter_voices(search: str, lang: str, mapping: dict):
    """Build the voice-dropdown update for a search term within a language.

    Args:
        search: Case-insensitive substring to look for in voice keys; an
            empty value leaves the list unfiltered.
        lang: Currently selected language code.
        mapping: Language -> voice-keys mapping; anything that is not a dict
            is treated as having no voices.

    Returns:
        A ``gr.update`` with the matching voices and the first match selected,
        or no selection when nothing matches.
    """
    if not isinstance(mapping, dict):
        matches = []
    else:
        matches = mapping.get(lang, [])

    if search:
        needle = search.strip().lower()
        matches = [key for key in matches if needle in key.lower()]

    first_match = matches[0] if matches else None
    return gr.update(choices=matches, value=first_match)
|
|
|
|
|
|
|
|
# UI layout and event wiring. Components are created in display order.
with gr.Blocks(title="Mimic 3 TTS") as demo:
    gr.Markdown(f"# Mimic 3 TTS\n{DESCRIPTION}")

    # --- Input text -------------------------------------------------------
    with gr.Row():
        text = gr.Textbox(
            label="Text",
            placeholder="Type text to synthesize…",
            lines=4,
        )

    # --- Voice selection --------------------------------------------------
    with gr.Row():
        language_dd = gr.Dropdown(label="Language", choices=[], interactive=True)
        voice_dd = gr.Dropdown(label="Voice", choices=[], interactive=True)
    with gr.Row():
        voice_search = gr.Textbox(
            label="Voice Search (filters by language)",
            placeholder="Type to filter voices, e.g., 'ko' or 'female' if present in key",
        )
        refresh_btn = gr.Button("Refresh Voices")
    with gr.Row():
        custom_voice = gr.Textbox(
            label="Custom Voice Key (optional)",
            placeholder="Overrides Voice dropdown if provided",
        )
    # Holds the language -> voice-keys mapping produced by load_voices.
    voices_state = gr.State({})

    # --- Optional SSML prosody controls -------------------------------------
    with gr.Accordion("Advanced (SSML)", open=False):
        use_ssml = gr.Checkbox(label="Use SSML prosody for rate/pitch", value=False)
        with gr.Row():
            rate = gr.Textbox(label="Rate (e.g., 85%, 110%)", placeholder="Optional")
            pitch = gr.Textbox(label="Pitch (e.g., +2st, -2st)", placeholder="Optional")

    with gr.Row():
        btn = gr.Button("Synthesize", variant="primary")

    audio = gr.Audio(label="Output Audio", type="filepath")

    # --- Event wiring (fn, inputs, outputs) ---------------------------------
    # Populate both dropdowns and the mapping when the page loads.
    demo.load(load_voices, None, [language_dd, voice_dd, voices_state])

    # Picking a language swaps in that language's voices.
    language_dd.change(on_language_change, [language_dd, voices_state], [voice_dd])

    # Typing in the search box narrows the voices of the current language.
    voice_search.change(filter_voices, [voice_search, language_dd, voices_state], [voice_dd])

    # Manual refresh repeats the page-load query.
    refresh_btn.click(load_voices, None, [language_dd, voice_dd, voices_state])

    def synthesize_with_custom(t: str, selected_voice: str, custom: str, use_ssml_val: bool, rate_val: str, pitch_val: str):
        """Synthesize using the custom voice key when one is given, else the dropdown voice."""
        override = custom.strip() if custom else ""
        chosen_voice = override or selected_voice
        return synthesize(t, chosen_voice, use_ssml_val, rate_val, pitch_val)

    btn.click(
        synthesize_with_custom,
        [text, voice_dd, custom_voice, use_ssml, rate, pitch],
        [audio],
    )
|
|
|
|
|
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from the PORT environment
    # variable when it is set, otherwise 7860 is used.
    port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)
|
|
|