# TTS / app.py
# R-TA's picture
# Update app.py
# 1a961fb verified
import os
import html
import shutil
import subprocess
import tempfile
from typing import Optional
import gradio as gr
# Markdown blurb shown beneath the "# Mimic 3 TTS" heading of the Gradio page
# (it is interpolated into the gr.Markdown component when the UI is built).
DESCRIPTION = """
Mimic 3 TTS on Hugging Face Spaces (Gradio)
- Uses the Mimic 3 CLI under-the-hood and returns MP3 audio (falls back to WAV if conversion fails).
- Leave the Voice Key blank to use the default voice, or provide a specific key (e.g., `en_US/cmu-arctic_low`).
- You can optionally wrap the input in SSML for rate/pitch by toggling the advanced options.
Note: The first run may download voice models and can take longer.
"""
def build_text(text: str, use_ssml: bool, rate: Optional[str], pitch: Optional[str]) -> str:
    """Return the text to pass to Mimic 3, wrapped in SSML prosody if requested.

    Args:
        text: Raw user text. ``None`` is treated as an empty string.
        use_ssml: Whether SSML prosody wrapping is enabled at all.
        rate: Optional SSML rate value, e.g. ``"85%"`` or ``"-10%"``.
        pitch: Optional SSML pitch value, e.g. ``"+2st"``.

    Returns:
        ``text`` unchanged when SSML is disabled or neither rate nor pitch
        carries a value; otherwise a ``<speak><prosody ...>`` document with
        the text and the attribute values escaped.
    """
    text = text or ""

    # Whitespace-only values count as "not provided"; previously they were
    # truthy and produced empty attributes such as rate="".
    rate = (rate or "").strip()
    pitch = (pitch or "").strip()
    if not use_ssml or not (rate or pitch):
        return text

    # Escape attribute values (quotes included) so user input cannot
    # terminate the attribute and corrupt the SSML document.
    attrs = "".join(
        f' {name}="{html.escape(value, quote=True)}"'
        for name, value in (("rate", rate), ("pitch", pitch))
        if value
    )
    return f"<speak><prosody{attrs}>{html.escape(text)}</prosody></speak>"
def synthesize(text: str, voice_key: str, use_ssml: bool, rate: str, pitch: str):
    """Synthesize ``text`` with the Mimic 3 CLI and return the audio file path.

    The CLI's stdout is written to a temporary WAV file. When ``ffmpeg`` is
    on PATH the WAV is converted to MP3; if ffmpeg is missing or the
    conversion fails, the WAV path is returned instead.

    Args:
        text: Text to speak. Blank or whitespace-only input yields ``None``.
        voice_key: Optional voice key passed via ``--voice``; blank means the
            CLI's default voice.
        use_ssml: When true, ``--ssml`` is passed and rate/pitch may be
            applied through ``build_text``.
        rate: Optional SSML rate value.
        pitch: Optional SSML pitch value.

    Returns:
        Path to an MP3 or WAV temp file, or ``None`` for blank input.

    Raises:
        gr.Error: If the CLI is missing, exits non-zero, produces no output,
            or any other unexpected failure occurs.
    """
    if not text or not text.strip():
        return None

    input_text = build_text(text.strip(), use_ssml, rate, pitch)

    # Assemble the CLI invocation as an argument list (no shell involved).
    cmd = ["mimic3"]
    if voice_key and voice_key.strip():
        cmd += ["--voice", voice_key.strip()]
    if use_ssml:
        cmd.append("--ssml")
    cmd.append(input_text)

    # Tracked so that a failure part-way through does not leak temp files.
    wav_path = None
    mp3_path = None
    try:
        # Run mimic3 and capture the WAV from stdout.
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        if proc.returncode != 0:
            err = proc.stderr.decode(errors="ignore")
            raise gr.Error(f"Mimic 3 failed (code {proc.returncode}).\n\n{err}")
        if not proc.stdout:
            # A zero-byte file would only surface later as a broken player.
            raise gr.Error("Mimic 3 produced no audio output.")

        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            wav_path = tmp.name
            tmp.write(proc.stdout)

        ffmpeg_path = shutil.which("ffmpeg")
        if not ffmpeg_path:
            return wav_path

        mp3_fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(mp3_fd)
        convert = subprocess.run(
            [ffmpeg_path, "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        if convert.returncode == 0 and os.path.exists(mp3_path):
            os.remove(wav_path)
            return mp3_path

        # Conversion failed; drop the MP3 placeholder and fall back to WAV.
        if os.path.exists(mp3_path):
            os.remove(mp3_path)
        return wav_path
    except gr.Error:
        # Already a user-facing error: re-raise it untouched instead of
        # letting the generic handler below re-wrap it via str().
        _discard_temp_files(wav_path, mp3_path)
        raise
    except FileNotFoundError as exc:
        # The mimic3 CLI was not found; show a helpful error in the UI.
        _discard_temp_files(wav_path, mp3_path)
        raise gr.Error(
            "mimic3 CLI not found. Ensure package 'mycroft-mimic3-tts' is installed and available in PATH."
        ) from exc
    except Exception as exc:
        _discard_temp_files(wav_path, mp3_path)
        raise gr.Error(str(exc)) from exc


def _discard_temp_files(*paths):
    """Best-effort removal of temp files left behind by a failed synthesis."""
    for path in paths:
        if not path:
            continue
        try:
            os.remove(path)
        except OSError:
            # Already gone or not removable; nothing useful to do here.
            pass
def _parse_voices(output: str):
# Returns (languages -> [voice_keys])
mapping = {}
for line in output.splitlines():
line = line.strip()
if not line:
continue
# Expect first token to be the voice key
key = line.split()[0]
if "/" in key:
lang = key.split("/", 1)[0]
else:
# Fallback bucket
lang = "other"
mapping.setdefault(lang, []).append(key)
# Sort voices
for lang in mapping:
mapping[lang].sort()
return mapping
def load_voices():
    """Query ``mimic3 --voices`` and build updates for the voice pickers.

    Returns:
        A 3-tuple ``(language_update, voice_update, mapping)``: a
        ``gr.update`` carrying ``(label, code)`` language choices with the
        alphabetically first language selected, a ``gr.update`` carrying that
        language's voices with the first one selected, and the
        language -> voice-keys mapping.

    Raises:
        gr.Error: If the CLI is missing, exits non-zero, or lists no voices.
    """
    # Keep the try body to the one call that can raise FileNotFoundError.
    try:
        proc = subprocess.run(
            ["mimic3", "--voices"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError as exc:
        raise gr.Error("mimic3 CLI not found. Ensure 'mycroft-mimic3-tts' is installed.") from exc

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Failed to list voices.\n\n{err}")

    mapping = _parse_voices(proc.stdout.decode(errors="ignore"))
    if not mapping:
        raise gr.Error("No voices found. Try again after models are available.")

    languages = sorted(mapping)
    default_lang = languages[0]
    voices = mapping[default_lang]

    # Human-readable labels for display, language codes as the actual values.
    language_choices = [(_lang_label(code), code) for code in languages]

    return (
        gr.update(choices=language_choices, value=default_lang),
        gr.update(choices=voices, value=voices[0] if voices else None),
        mapping,
    )


# Display names keyed by the lower-cased language part of a code like "en_US".
_LANGUAGE_NAMES = {
    "af": "Afrikaans",
    "ar": "Arabic",
    "bn": "Bengali",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "gu": "Gujarati",
    "ha": "Hausa",
    "hi": "Hindi",
    "hu": "Hungarian",
    "it": "Italian",
    "ja": "Japanese",
    "jv": "Javanese",
    "ko": "Korean",
    "ne": "Nepali",
    "nl": "Dutch",
    "no": "Norwegian",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sv": "Swedish",
    "sw": "Swahili",
    "te": "Telugu",
    "tn": "Tswana",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "vi": "Vietnamese",
    "yo": "Yoruba",
    "zh": "Chinese",
}


def _lang_label(code: str) -> str:
    """Return a display label for a language code such as ``en_US``."""
    base, _, region = code.partition("_")
    base_name = _LANGUAGE_NAMES.get(base.lower())
    if base_name is None:
        # Unknown language: show the raw code as-is. Appending the region
        # here would repeat it, e.g. "xx_YY (YY)".
        return code
    return f"{base_name} ({region})" if region else base_name
def on_language_change(lang: str, mapping: dict):
    """Return a dropdown update listing the voices of the selected language.

    ``mapping`` is the language -> voice-keys state; anything that is not a
    dict (or a language missing from it) results in an empty choice list.
    The first available voice, if any, becomes the selected value.
    """
    if isinstance(mapping, dict):
        candidates = mapping.get(lang, [])
    else:
        candidates = []
    selected = candidates[0] if candidates else None
    return gr.update(choices=candidates, value=selected)
def filter_voices(search: str, lang: str, mapping: dict):
    """Return a dropdown update with the language's voices matching ``search``.

    Matching is a case-insensitive substring test against each voice key,
    using the stripped search text. An empty search keeps every voice of the
    language. The first remaining voice, if any, becomes the selected value.
    """
    if isinstance(mapping, dict):
        candidates = mapping.get(lang, [])
    else:
        candidates = []

    if search:
        needle = search.strip().lower()
        candidates = [key for key in candidates if needle in key.lower()]

    selected = candidates[0] if candidates else None
    return gr.update(choices=candidates, value=selected)
# Gradio UI: component layout followed by the event wiring.
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the statement order. In particular, whether
# `voices_state` and `audio` sit inside the preceding Row is a guess -- confirm.
with gr.Blocks(title="Mimic 3 TTS") as demo:
    gr.Markdown(f"# Mimic 3 TTS\n{DESCRIPTION}")
    # Input text to synthesize.
    with gr.Row():
        text = gr.Textbox(label="Text", placeholder="Type text to synthesize…", lines=4)
    # Language / voice pickers; choices start empty and are filled by load_voices.
    with gr.Row():
        language_dd = gr.Dropdown(label="Language", choices=[], interactive=True)
        voice_dd = gr.Dropdown(label="Voice", choices=[], interactive=True)
    with gr.Row():
        voice_search = gr.Textbox(label="Voice Search (filters by language)", placeholder="Type to filter voices, e.g., 'ko' or 'female' if present in key")
        refresh_btn = gr.Button("Refresh Voices")
    with gr.Row():
        custom_voice = gr.Textbox(label="Custom Voice Key (optional)", placeholder="Overrides Voice dropdown if provided")
    # Holds the language -> voice-keys mapping returned by load_voices.
    voices_state = gr.State({})
    with gr.Accordion("Advanced (SSML)", open=False):
        use_ssml = gr.Checkbox(label="Use SSML prosody for rate/pitch", value=False)
        with gr.Row():
            rate = gr.Textbox(label="Rate (e.g., 85%, 110%)", placeholder="Optional")
            pitch = gr.Textbox(label="Pitch (e.g., +2st, -2st)", placeholder="Optional")
    with gr.Row():
        btn = gr.Button("Synthesize", variant="primary")
    # type="filepath": synthesize returns a path to the generated audio file.
    audio = gr.Audio(label="Output Audio", type="filepath")
    # Load voices at app start
    demo.load(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )
    # Change voices when language changes
    language_dd.change(
        fn=on_language_change,
        inputs=[language_dd, voices_state],
        outputs=[voice_dd],
    )
    # Filter voices as user types
    voice_search.change(
        fn=filter_voices,
        inputs=[voice_search, language_dd, voices_state],
        outputs=[voice_dd],
    )
    # Refresh voices list from CLI
    refresh_btn.click(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )

    def synthesize_with_custom(t: str, selected_voice: str, custom: str, use_ssml_val: bool, rate_val: str, pitch_val: str):
        """Run synthesize, letting a non-blank custom voice key override the dropdown."""
        voice = custom.strip() if (custom and custom.strip()) else selected_voice
        return synthesize(t, voice, use_ssml_val, rate_val, pitch_val)

    # Synthesize on click and send the resulting file path to the audio player.
    btn.click(
        fn=synthesize_with_custom,
        inputs=[text, voice_dd, custom_voice, use_ssml, rate, pitch],
        outputs=[audio],
    )
# Script entry point: listen on all interfaces, on $PORT when set (else 7860).
if __name__ == "__main__":
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)