Spaces:

R-TA
/

TTS

Sleeping

TTS

File size: 8,770 Bytes

import os
import html
import shutil
import subprocess
import tempfile
from typing import Optional

import gradio as gr


# Markdown blurb shown to the user; it is interpolated into the page header
# Markdown component when the Blocks UI is built.
DESCRIPTION = """
Mimic 3 TTS on Hugging Face Spaces (Gradio)

- Uses the Mimic 3 CLI under-the-hood and returns MP3 audio (falls back to WAV if conversion fails).
- Leave the Voice Key blank to use the default voice, or provide a specific key (e.g., `en_US/cmu-arctic_low`).
- You can optionally wrap the input in SSML for rate/pitch by toggling the advanced options.

Note: The first run may download voice models and can take longer.
"""


def build_text(text: str, use_ssml: bool, rate: Optional[str], pitch: Optional[str]) -> str:
    """Return the text to hand to Mimic 3, optionally wrapped in SSML prosody.

    Args:
        text: Raw user text. ``None`` is treated as an empty string.
        use_ssml: Whether SSML prosody wrapping was requested.
        rate: Optional SSML prosody rate, e.g. ``"85%"``.
        pitch: Optional SSML prosody pitch, e.g. ``"+2st"``.

    Returns:
        ``text`` unchanged when SSML is disabled or neither rate nor pitch
        carries a value; otherwise a ``<speak><prosody ...>`` document in
        which both the text and the attribute values are XML-escaped.
    """
    text = text or ""
    # Normalise first so that whitespace-only values count as "not provided"
    # instead of producing an empty attribute such as rate="".
    rate = (rate or "").strip()
    pitch = (pitch or "").strip()
    if not use_ssml or not (rate or pitch):
        return text

    # Attribute values come straight from user input, so they are escaped as
    # well; an embedded quote would otherwise break out of the attribute.
    rate_attr = f' rate="{html.escape(rate, quote=True)}"' if rate else ""
    pitch_attr = f' pitch="{html.escape(pitch, quote=True)}"' if pitch else ""
    return f"<speak><prosody{rate_attr}{pitch_attr}>{html.escape(text)}</prosody></speak>"


def _remove_quietly(path: Optional[str]) -> None:
    """Best-effort removal of a temporary file; a missing file is not an error."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        # Cleanup must never mask the result or the original error.
        pass


def _convert_wav_to_mp3(ffmpeg_path: str, wav_path: str) -> Optional[str]:
    """Convert ``wav_path`` to MP3 with ffmpeg; return the MP3 path or ``None`` on failure."""
    mp3_path = None
    try:
        mp3_fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(mp3_fd)
        convert = subprocess.run(
            [ffmpeg_path, "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        # mkstemp already created the file, so existence proves nothing;
        # require a non-empty result as well as a zero exit code.
        if convert.returncode == 0 and os.path.getsize(mp3_path) > 0:
            return mp3_path
    except OSError:
        # ffmpeg vanished or the temp dir is unusable: fall back to WAV.
        pass
    _remove_quietly(mp3_path)
    return None


def synthesize(text: str, voice_key: str, use_ssml: bool, rate: str, pitch: str):
    """Run the Mimic 3 CLI on ``text`` and return the path of the audio file.

    Args:
        text: Text to speak. Blank or whitespace-only input yields ``None``.
        voice_key: Optional Mimic 3 voice key; blank uses the CLI default.
        use_ssml: Pass ``--ssml`` to the CLI and allow prosody wrapping.
        rate: Optional SSML prosody rate forwarded to ``build_text``.
        pitch: Optional SSML prosody pitch forwarded to ``build_text``.

    Returns:
        Path to a temporary MP3 file, or to the temporary WAV file when
        ffmpeg is unavailable or the conversion fails; ``None`` for blank input.

    Raises:
        gr.Error: If the ``mimic3`` executable is missing, exits with a
            non-zero status, produces no audio, or the output cannot be saved.
    """
    if not text or not text.strip():
        return None

    input_text = build_text(text.strip(), use_ssml, rate, pitch)

    # Prepare the command
    cmd = ["mimic3"]
    if voice_key and voice_key.strip():
        cmd += ["--voice", voice_key.strip()]
    if use_ssml:
        cmd.append("--ssml")
    # "--" ends option parsing, so user text that starts with "-" is treated
    # as text to speak rather than as a command-line flag.
    cmd += ["--", input_text]

    # Only the mimic3 invocation is guarded by the "CLI not found" handler so
    # that later failures (e.g. ffmpeg) are not misreported.
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError:
        # The mimic3 CLI was not found; show a helpful error in the UI
        raise gr.Error(
            "mimic3 CLI not found. Ensure package 'mycroft-mimic3-tts' is installed and available in PATH."
        ) from None
    except Exception as e:
        raise gr.Error(str(e)) from e

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Mimic 3 failed (code {proc.returncode}).\n\n{err}")
    if not proc.stdout:
        # An empty file would only fail later, inside the audio player.
        raise gr.Error("Mimic 3 produced no audio output.")

    # Write the WAV bytes (captured from stdout) to a temp file
    wav_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            wav_path = tmp.name
            tmp.write(proc.stdout)
    except OSError as e:
        _remove_quietly(wav_path)
        raise gr.Error(str(e)) from e

    ffmpeg_path = shutil.which("ffmpeg")
    if not ffmpeg_path:
        return wav_path

    mp3_path = _convert_wav_to_mp3(ffmpeg_path, wav_path)
    if mp3_path is None:
        # Conversion failed; return WAV instead
        return wav_path

    _remove_quietly(wav_path)
    return mp3_path


def _parse_voices(output: str):
    # Returns (languages -> [voice_keys])
    mapping = {}
    for line in output.splitlines():
        line = line.strip()
        if not line:
            continue
        # Expect first token to be the voice key
        key = line.split()[0]
        if "/" in key:
            lang = key.split("/", 1)[0]
        else:
            # Fallback bucket
            lang = "other"
        mapping.setdefault(lang, []).append(key)
    # Sort voices
    for lang in mapping:
        mapping[lang].sort()
    return mapping


# Display names for the language part of a voice-key prefix (e.g. "en" in "en_US").
_LANGUAGE_NAMES = {
    "en": "English",
    "ko": "Korean",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
    "nl": "Dutch",
    "sv": "Swedish",
    "no": "Norwegian",
    "da": "Danish",
    "fi": "Finnish",
    "pl": "Polish",
    "ru": "Russian",
    "tr": "Turkish",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
    "zh": "Chinese",
}


def _lang_label(code: str) -> str:
    """Return a human-readable label for a language code such as ``en_US``."""
    base, _, region = code.partition("_")
    name = _LANGUAGE_NAMES.get(base.lower())
    if name is None:
        # Unknown language: show the raw code as-is. Appending the region
        # again would produce labels like "af_ZA (ZA)".
        return code
    return f"{name} ({region})" if region else name


def load_voices():
    """Query ``mimic3 --voices`` and build updates for the voice selectors.

    Returns:
        A 3-tuple ``(language_update, voice_update, mapping)``: a dropdown
        update whose choices are ``(label, code)`` pairs with the
        alphabetically first language selected, a dropdown update listing
        that language's voices with the first one selected, and the
        language -> voice-keys mapping itself.

    Raises:
        gr.Error: If the ``mimic3`` executable is missing, the command exits
            with a non-zero status, or no voices are listed.
    """
    # Keep the try body to the one call that can raise FileNotFoundError.
    try:
        proc = subprocess.run(
            ["mimic3", "--voices"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError:
        raise gr.Error("mimic3 CLI not found. Ensure 'mycroft-mimic3-tts' is installed.") from None

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Failed to list voices.\n\n{err}")

    mapping = _parse_voices(proc.stdout.decode(errors="ignore"))
    if not mapping:
        raise gr.Error("No voices found. Try again after models are available.")

    languages = sorted(mapping)
    default_lang = languages[0]
    voices = mapping[default_lang]

    # Human-readable labels for languages while using codes as values
    language_choices = [(_lang_label(code), code) for code in languages]

    # Return updates for dropdowns and the mapping state
    return (
        gr.update(choices=language_choices, value=default_lang),
        gr.update(choices=voices, value=voices[0] if voices else None),
        mapping,
    )


def on_language_change(lang: str, mapping: dict):
    """Build a voice-dropdown update for the newly selected language.

    The dropdown is filled with the voices registered for ``lang`` in
    ``mapping`` and its first entry is selected; when ``mapping`` is not a
    dict or has no entry for ``lang`` the dropdown is emptied.
    """
    if isinstance(mapping, dict):
        candidates = mapping.get(lang, [])
    else:
        candidates = []

    selected = candidates[0] if candidates else None
    return gr.update(choices=candidates, value=selected)


def filter_voices(search: str, lang: str, mapping: dict):
    """Build a voice-dropdown update restricted to keys matching ``search``.

    Matching is a case-insensitive substring test against the voices of
    ``lang``; an empty search keeps the full list. The first remaining voice
    (if any) becomes the selected value.
    """
    pool = []
    if isinstance(mapping, dict):
        pool = mapping.get(lang, [])

    if not search:
        matches = pool
    else:
        needle = search.strip().lower()
        matches = list(filter(lambda key: needle in key.lower(), pool))

    first = matches[0] if matches else None
    return gr.update(choices=matches, value=first)


# Build the Gradio UI: text input, language/voice pickers, optional SSML
# controls and the audio output, then wire the components to the handlers
# defined above.
with gr.Blocks(title="Mimic 3 TTS") as demo:
    gr.Markdown(f"# Mimic 3 TTS\n{DESCRIPTION}")

    # Text to synthesize
    with gr.Row():
        text = gr.Textbox(label="Text", placeholder="Type text to synthesize…", lines=4)

    # Language and voice pickers start empty; they are populated by load_voices.
    with gr.Row():
        language_dd = gr.Dropdown(label="Language", choices=[], interactive=True)
        voice_dd = gr.Dropdown(label="Voice", choices=[], interactive=True)
    with gr.Row():
        voice_search = gr.Textbox(label="Voice Search (filters by language)", placeholder="Type to filter voices, e.g., 'ko' or 'female' if present in key")
        refresh_btn = gr.Button("Refresh Voices")
    with gr.Row():
        custom_voice = gr.Textbox(label="Custom Voice Key (optional)", placeholder="Overrides Voice dropdown if provided")
        # Holds the language -> voice-keys mapping returned by load_voices so
        # the change handlers can look voices up without re-running the CLI.
        voices_state = gr.State({})

    # Optional prosody controls; their values are passed to synthesize().
    with gr.Accordion("Advanced (SSML)", open=False):
        use_ssml = gr.Checkbox(label="Use SSML prosody for rate/pitch", value=False)
        with gr.Row():
            rate = gr.Textbox(label="Rate (e.g., 85%, 110%)", placeholder="Optional")
            pitch = gr.Textbox(label="Pitch (e.g., +2st, -2st)", placeholder="Optional")

    with gr.Row():
        btn = gr.Button("Synthesize", variant="primary")

    # type="filepath": the handler returns a path to the generated audio file.
    audio = gr.Audio(label="Output Audio", type="filepath")

    # Load voices at app start
    demo.load(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )

    # Change voices when language changes
    language_dd.change(
        fn=on_language_change,
        inputs=[language_dd, voices_state],
        outputs=[voice_dd],
    )

    # Filter voices as user types
    voice_search.change(
        fn=filter_voices,
        inputs=[voice_search, language_dd, voices_state],
        outputs=[voice_dd],
    )

    # Refresh voices list from CLI
    refresh_btn.click(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )

    def synthesize_with_custom(t: str, selected_voice: str, custom: str, use_ssml_val: bool, rate_val: str, pitch_val: str):
        """Synthesize ``t``, preferring a non-blank custom voice key over the dropdown value."""
        voice = custom.strip() if (custom and custom.strip()) else selected_voice
        return synthesize(t, voice, use_ssml_val, rate_val, pitch_val)

    # Run synthesis on click and show the resulting file in the audio player.
    btn.click(
        fn=synthesize_with_custom,
        inputs=[text, voice_dd, custom_voice, use_ssml, rate, pitch],
        outputs=[audio],
    )

if __name__ == "__main__":
    # Listen on all interfaces; the port is read from $PORT and falls back
    # to 7860 when the variable is unset.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)