File size: 8,770 Bytes
bb35394
 
1a961fb
bb35394
 
041788a
bb35394
 
 
 
 
 
 
1a961fb
bb35394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a961fb
bb35394
 
1a961fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb35394
 
 
 
 
 
 
20b568c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
041788a
20b568c
 
 
 
 
041788a
20b568c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
041788a
 
 
 
 
 
 
 
bb35394
 
 
 
 
 
 
20b568c
 
041788a
 
 
 
 
20b568c
bb35394
 
 
 
 
 
 
 
 
 
 
 
20b568c
 
 
 
 
 
 
 
 
 
 
 
 
 
041788a
 
 
 
 
bb35394
 
041788a
 
 
 
 
 
2f04210
041788a
 
 
2f04210
041788a
 
 
 
 
20b568c
041788a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import os
import html
import shutil
import subprocess
import tempfile
from typing import Optional

import gradio as gr


# Markdown rendered under the page title. The voice bullet mirrors the actual
# controls: Language/Voice dropdowns plus the "Custom Voice Key" override box.
DESCRIPTION = """
Mimic 3 TTS on Hugging Face Spaces (Gradio)

- Uses the Mimic 3 CLI under-the-hood and returns MP3 audio (falls back to WAV if conversion fails).
- Pick a language and voice from the dropdowns, or fill in the Custom Voice Key (e.g., `en_US/cmu-arctic_low`) to override the selected voice.
- You can optionally wrap the input in SSML for rate/pitch by toggling the advanced options.

Note: The first run may download voice models and can take longer.
"""


def build_text(text: str, use_ssml: bool, rate: Optional[str], pitch: Optional[str]) -> str:
    """Return the text to hand to the synthesizer, optionally wrapped in SSML.

    Args:
        text: Raw user text; ``None`` is treated as an empty string.
        use_ssml: When false the text is returned untouched.
        rate: SSML prosody rate (e.g. ``"85%"``); blank/``None`` means unset.
        pitch: SSML prosody pitch (e.g. ``"+2st"``); blank/``None`` means unset.

    Returns:
        ``text`` unchanged when SSML is disabled or neither adjustment is set,
        otherwise ``<speak><prosody ...>escaped text</prosody></speak>``.
    """
    text = text or ""
    # Normalize first so whitespace-only inputs count as "not provided"
    # instead of producing an empty attribute such as rate="".
    rate = (rate or "").strip()
    pitch = (pitch or "").strip()
    if not use_ssml or not (rate or pitch):
        return text

    # Attribute values come from free-form text boxes, so escape them
    # (including quotes) to keep the generated markup well-formed.
    attrs = "".join(
        f' {name}="{html.escape(value, quote=True)}"'
        for name, value in (("rate", rate), ("pitch", pitch))
        if value
    )
    return f"<speak><prosody{attrs}>{html.escape(text)}</prosody></speak>"


def _remove_quietly(path: str) -> None:
    """Delete a temp file, ignoring OS errors (cleanup is best-effort)."""
    try:
        os.remove(path)
    except OSError:
        pass


def synthesize(text: str, voice_key: str, use_ssml: bool, rate: str, pitch: str):
    """Run the Mimic 3 CLI on ``text`` and return a path to the audio file.

    Args:
        text: Text to speak; blank input short-circuits to ``None``.
        voice_key: Optional voice key passed via ``--voice``; blank means the
            flag is omitted.
        use_ssml: When true, ``--ssml`` is passed and ``build_text`` may wrap
            the text in a prosody element.
        rate: Optional SSML rate forwarded to ``build_text``.
        pitch: Optional SSML pitch forwarded to ``build_text``.

    Returns:
        ``None`` for blank input; otherwise the path of a temporary ``.mp3``
        file, or of the temporary ``.wav`` file when ffmpeg is unavailable or
        the conversion fails.

    Raises:
        gr.Error: If the CLI is missing, exits non-zero, or yields no audio.
    """
    if not text or not text.strip():
        return None

    input_text = build_text(text.strip(), use_ssml, rate, pitch)

    # Prepare the command
    cmd = ["mimic3"]
    if voice_key and voice_key.strip():
        cmd += ["--voice", voice_key.strip()]
    if use_ssml:
        cmd += ["--ssml"]
    # The text comes straight from the web UI. "--" ends option parsing
    # (standard argparse-style convention) so input such as "--foo=bar" is
    # spoken as text instead of being interpreted as a CLI option.
    cmd += ["--", input_text]

    # Only the launch itself can mean "CLI not installed"; keep the try narrow
    # so unrelated file errors later on are not mislabelled.
    try:
        # Run mimic3 and capture the WAV from stdout
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError:
        # The mimic3 CLI was not found; show a helpful error in the UI
        raise gr.Error(
            "mimic3 CLI not found. Ensure package 'mycroft-mimic3-tts' is installed and available in PATH."
        ) from None
    except Exception as e:
        raise gr.Error(str(e)) from e

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Mimic 3 failed (code {proc.returncode}).\n\n{err}")
    if not proc.stdout:
        # A zero exit code with no bytes would otherwise become an empty,
        # unplayable WAV file.
        raise gr.Error("Mimic 3 produced no audio output.")

    # Write the WAV bytes to a temp file
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(proc.stdout)
            wav_path = tmp.name
    except OSError as e:
        raise gr.Error(str(e)) from e

    ffmpeg_path = shutil.which("ffmpeg")
    if not ffmpeg_path:
        return wav_path

    # From here on every failure degrades to returning the WAV.
    try:
        mp3_fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(mp3_fd)
    except OSError:
        return wav_path

    try:
        convert = subprocess.run(
            [ffmpeg_path, "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
        converted = convert.returncode == 0 and os.path.exists(mp3_path)
    except OSError:
        converted = False

    if converted:
        _remove_quietly(wav_path)
        return mp3_path

    # Conversion failed; clean up mp3 placeholder and return WAV instead
    _remove_quietly(mp3_path)
    return wav_path


def _parse_voices(output: str):
    # Returns (languages -> [voice_keys])
    mapping = {}
    for line in output.splitlines():
        line = line.strip()
        if not line:
            continue
        # Expect first token to be the voice key
        key = line.split()[0]
        if "/" in key:
            lang = key.split("/", 1)[0]
        else:
            # Fallback bucket
            lang = "other"
        mapping.setdefault(lang, []).append(key)
    # Sort voices
    for lang in mapping:
        mapping[lang].sort()
    return mapping


# Display names for the base language part of a code (the text before "_").
# Built once at import time instead of on every load_voices() call.
_LANGUAGE_NAMES = {
    "en": "English",
    "ko": "Korean",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
    "nl": "Dutch",
    "sv": "Swedish",
    "no": "Norwegian",
    "da": "Danish",
    "fi": "Finnish",
    "pl": "Polish",
    "ru": "Russian",
    "tr": "Turkish",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
    "zh": "Chinese",
}


def _lang_label(code: str) -> str:
    """Return a human-readable label for a language code like ``en_US``.

    Known base languages render as ``"English (US)"`` (or just ``"English"``
    without a region). Unknown codes are shown verbatim; appending the region
    again would duplicate it (e.g. ``"af_ZA (ZA)"``).
    """
    base, _, region = code.partition("_")
    base_name = _LANGUAGE_NAMES.get(base.lower())
    if base_name is None:
        return code
    return f"{base_name} ({region})" if region else base_name


def load_voices():
    """List the voices known to the Mimic 3 CLI and build dropdown updates.

    Runs ``mimic3 --voices``, groups the keys by language with
    ``_parse_voices`` and preselects the alphabetically first language and its
    first voice.

    Returns:
        A 3-tuple ``(language_update, voice_update, mapping)`` where the first
        two are ``gr.update`` payloads for the dropdowns and ``mapping`` is the
        language -> voice keys dict.

    Raises:
        gr.Error: If the CLI is missing, exits non-zero, or lists no voices.
    """
    # Keep the try limited to the call that can actually report a missing CLI.
    try:
        proc = subprocess.run(
            ["mimic3", "--voices"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError:
        raise gr.Error("mimic3 CLI not found. Ensure 'mycroft-mimic3-tts' is installed.") from None

    if proc.returncode != 0:
        err = proc.stderr.decode(errors="ignore")
        raise gr.Error(f"Failed to list voices.\n\n{err}")

    mapping = _parse_voices(proc.stdout.decode(errors="ignore"))
    if not mapping:
        raise gr.Error("No voices found. Try again after models are available.")

    languages = sorted(mapping)
    default_lang = languages[0]
    voices = mapping[default_lang]

    # Human-readable labels for display while the raw codes stay as values.
    language_choices = [(_lang_label(code), code) for code in languages]

    # Return updates for dropdowns and the mapping state
    return (
        gr.update(choices=language_choices, value=default_lang),
        gr.update(choices=voices, value=voices[0] if voices else None),
        mapping,
    )


def on_language_change(lang: str, mapping: dict):
    """Return a Voice-dropdown update listing the voices of ``lang``.

    ``mapping`` is the language -> voice keys dict; anything that is not a
    dict, or a language missing from it, yields an empty choice list. The
    first voice (if any) becomes the selected value.
    """
    if isinstance(mapping, dict):
        available = mapping.get(lang, [])
    else:
        available = []
    selected = available[0] if available else None
    return gr.update(choices=available, value=selected)


def filter_voices(search: str, lang: str, mapping: dict):
    """Return a Voice-dropdown update narrowed by a case-insensitive search.

    Starts from the voices of ``lang`` in ``mapping`` (empty when ``mapping``
    is not a dict or the language is absent). A non-empty ``search`` keeps
    only keys containing the stripped, lower-cased term. The first remaining
    voice (if any) becomes the selected value.
    """
    if isinstance(mapping, dict):
        matches = mapping.get(lang, [])
    else:
        matches = []

    if search:
        needle = search.strip().lower()
        matches = [key for key in matches if needle in key.lower()]

    selected = matches[0] if matches else None
    return gr.update(choices=matches, value=selected)


# Build the Gradio UI. Components are created in the order they appear below,
# and the event wiring at the bottom connects them to the functions above.
with gr.Blocks(title="Mimic 3 TTS") as demo:
    # Page heading followed by the module-level DESCRIPTION markdown.
    gr.Markdown(f"# Mimic 3 TTS\n{DESCRIPTION}")

    # Main text input.
    with gr.Row():
        text = gr.Textbox(label="Text", placeholder="Type text to synthesize…", lines=4)

    # Language / voice pickers; both start empty and are filled by load_voices.
    with gr.Row():
        language_dd = gr.Dropdown(label="Language", choices=[], interactive=True)
        voice_dd = gr.Dropdown(label="Voice", choices=[], interactive=True)
    # Search box narrowing the voice list, plus a manual refresh button.
    with gr.Row():
        voice_search = gr.Textbox(label="Voice Search (filters by language)", placeholder="Type to filter voices, e.g., 'ko' or 'female' if present in key")
        refresh_btn = gr.Button("Refresh Voices")
    with gr.Row():
        # Free-form key that takes precedence over the dropdown (see synthesize_with_custom).
        custom_voice = gr.Textbox(label="Custom Voice Key (optional)", placeholder="Overrides Voice dropdown if provided")
        # Holds the language -> voice keys mapping returned by load_voices.
        voices_state = gr.State({})

    # Optional prosody controls; values are forwarded to synthesize().
    with gr.Accordion("Advanced (SSML)", open=False):
        use_ssml = gr.Checkbox(label="Use SSML prosody for rate/pitch", value=False)
        with gr.Row():
            rate = gr.Textbox(label="Rate (e.g., 85%, 110%)", placeholder="Optional")
            pitch = gr.Textbox(label="Pitch (e.g., +2st, -2st)", placeholder="Optional")

    with gr.Row():
        btn = gr.Button("Synthesize", variant="primary")

    # Receives the file path returned by synthesize().
    audio = gr.Audio(label="Output Audio", type="filepath")

    # Load voices at app start
    demo.load(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )

    # Change voices when language changes
    language_dd.change(
        fn=on_language_change,
        inputs=[language_dd, voices_state],
        outputs=[voice_dd],
    )

    # Filter voices as user types
    voice_search.change(
        fn=filter_voices,
        inputs=[voice_search, language_dd, voices_state],
        outputs=[voice_dd],
    )

    # Refresh voices list from CLI
    refresh_btn.click(
        fn=load_voices,
        inputs=None,
        outputs=[language_dd, voice_dd, voices_state],
    )

    def synthesize_with_custom(t: str, selected_voice: str, custom: str, use_ssml_val: bool, rate_val: str, pitch_val: str):
        """Call synthesize(), preferring a non-blank custom key over the dropdown voice."""
        voice = custom.strip() if (custom and custom.strip()) else selected_voice
        return synthesize(t, voice, use_ssml_val, rate_val, pitch_val)

    # Run synthesis on click and show the resulting file in the audio player.
    btn.click(
        fn=synthesize_with_custom,
        inputs=[text, voice_dd, custom_voice, use_ssml, rate, pitch],
        outputs=[audio],
    )

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)