Spaces:

R-TA
/

TTS

Sleeping

App Files Files Community

TTS / app.py

R-TA

Update app.py

1a961fb verified 4 months ago

raw

history blame contribute delete

8.77 kB

	import os
	import html
	import shutil
	import subprocess
	import tempfile
	from typing import Optional

	import gradio as gr


	# Markdown blurb describing the app; it is interpolated into the gr.Markdown
	# header of the UI, directly below the "# Mimic 3 TTS" title.
	DESCRIPTION = """
	Mimic 3 TTS on Hugging Face Spaces (Gradio)

	- Uses the Mimic 3 CLI under-the-hood and returns MP3 audio (falls back to WAV if conversion fails).
	- Leave the Voice Key blank to use the default voice, or provide a specific key (e.g., `en_US/cmu-arctic_low`).
	- You can optionally wrap the input in SSML for rate/pitch by toggling the advanced options.

	Note: The first run may download voice models and can take longer.
	"""


	def build_text(text: str, use_ssml: bool, rate: Optional[str], pitch: Optional[str]) -> str:
	    """Return the text to synthesize, optionally wrapped in SSML prosody markup.

	    Args:
	        text: Raw user text; ``None`` is treated as an empty string.
	        use_ssml: Whether SSML wrapping was requested.
	        rate: Optional SSML prosody rate (e.g. ``"85%"``).
	        pitch: Optional SSML prosody pitch (e.g. ``"+2st"``).

	    Returns:
	        ``text`` unchanged when SSML is disabled or neither rate nor pitch
	        carries a value; otherwise a ``<speak><prosody ...>`` document with
	        the text and attribute values XML-escaped.
	    """
	    text = text or ""
	    # Normalise first so whitespace-only inputs count as "not provided"
	    # instead of producing an empty rate=""/pitch="" attribute.
	    rate = (rate or "").strip()
	    pitch = (pitch or "").strip()
	    if not use_ssml or (not rate and not pitch):
	        return text

	    # Attribute values come straight from a text box, so they are escaped
	    # (including quotes) to keep them from breaking out of the attribute.
	    attrs = ""
	    if rate:
	        attrs += f' rate="{html.escape(rate, quote=True)}"'
	    if pitch:
	        attrs += f' pitch="{html.escape(pitch, quote=True)}"'
	    return f"<speak><prosody{attrs}>{html.escape(text)}</prosody></speak>"


	def _convert_wav_to_mp3(wav_path: str, ffmpeg_path: str) -> Optional[str]:
	    """Convert ``wav_path`` to MP3 with ffmpeg; return the MP3 path or ``None`` on failure."""
	    try:
	        mp3_fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
	    except OSError:
	        return None
	    os.close(mp3_fd)

	    try:
	        convert = subprocess.run(
	            [ffmpeg_path, "-y", "-i", wav_path, "-codec:a", "libmp3lame", "-qscale:a", "4", mp3_path],
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            check=False,
	        )
	        converted = convert.returncode == 0 and os.path.exists(mp3_path)
	    except OSError:
	        # ffmpeg could not be executed; treat like any other failed conversion.
	        converted = False

	    if converted:
	        return mp3_path

	    # Conversion failed; remove the placeholder created by mkstemp.
	    try:
	        if os.path.exists(mp3_path):
	            os.remove(mp3_path)
	    except OSError:
	        pass  # best-effort cleanup of a temp file
	    return None


	def synthesize(text: str, voice_key: str, use_ssml: bool, rate: str, pitch: str):
	    """Synthesize ``text`` with the mimic3 CLI and return the path of the audio file.

	    Args:
	        text: Text to speak; blank input short-circuits to ``None``.
	        voice_key: Optional voice key passed via ``--voice``; blank means the CLI default.
	        use_ssml: When true, ``--ssml`` is passed and rate/pitch may wrap the text.
	        rate: Optional SSML prosody rate.
	        pitch: Optional SSML prosody pitch.

	    Returns:
	        Path to an MP3 file when ffmpeg is available and conversion succeeds,
	        otherwise the path to the WAV file; ``None`` for blank input.

	    Raises:
	        gr.Error: if the CLI is missing, cannot be run, exits non-zero, or the
	            audio cannot be written to a temporary file.
	    """
	    if not text or not text.strip():
	        return None

	    input_text = build_text(text.strip(), use_ssml, rate, pitch)

	    # Prepare the command
	    cmd = ["mimic3"]
	    if voice_key and voice_key.strip():
	        cmd += ["--voice", voice_key.strip()]
	    if use_ssml:
	        # NOTE(review): with SSML enabled but no rate/pitch, build_text returns
	        # the raw text, which is then sent with --ssml — confirm the CLI
	        # accepts non-markup input in that mode.
	        cmd += ["--ssml"]
	    # "--" ends option parsing, so user text that begins with "-" (for example
	    # "--output-dir=/x") is synthesized as text instead of being read as a flag.
	    cmd += ["--", input_text]

	    # Only the mimic3 invocation is guarded by the "CLI not found" handler, so
	    # the message cannot be triggered by an unrelated missing file later on.
	    try:
	        # Run mimic3 and capture the WAV from stdout
	        proc = subprocess.run(
	            cmd,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            check=False,
	        )
	    except FileNotFoundError:
	        # The mimic3 CLI was not found; show a helpful error in the UI
	        raise gr.Error("mimic3 CLI not found. Ensure package 'mycroft-mimic3-tts' is installed and available in PATH.")
	    except Exception as e:
	        raise gr.Error(str(e)) from e

	    if proc.returncode != 0:
	        err = proc.stderr.decode(errors="ignore")
	        raise gr.Error(f"Mimic 3 failed (code {proc.returncode}).\n\n{err}")

	    # Write the WAV bytes to a temp file; drop the file again if writing fails.
	    wav_path = None
	    try:
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
	            wav_path = tmp.name
	            tmp.write(proc.stdout)
	    except OSError as e:
	        if wav_path and os.path.exists(wav_path):
	            os.remove(wav_path)
	        raise gr.Error(str(e)) from e

	    ffmpeg_path = shutil.which("ffmpeg")
	    if not ffmpeg_path:
	        return wav_path

	    mp3_path = _convert_wav_to_mp3(wav_path, ffmpeg_path)
	    if mp3_path is None:
	        # Conversion failed; return WAV instead
	        return wav_path

	    try:
	        os.remove(wav_path)
	    except OSError:
	        pass  # the MP3 is ready; a leftover temp WAV must not fail the request
	    return mp3_path


	def _parse_voices(output: str):
	# Returns (languages -> [voice_keys])
	mapping = {}
	for line in output.splitlines():
	line = line.strip()
	if not line:
	continue
	# Expect first token to be the voice key
	key = line.split()[0]
	if "/" in key:
	lang = key.split("/", 1)[0]
	else:
	# Fallback bucket
	lang = "other"
	mapping.setdefault(lang, []).append(key)
	# Sort voices
	for lang in mapping:
	mapping[lang].sort()
	return mapping


	def load_voices():
	    """Query ``mimic3 --voices`` and build updates for the language/voice dropdowns.

	    Returns:
	        A 3-tuple: an update for the language dropdown (labelled choices with
	        the first language in sorted order selected), an update for the voice
	        dropdown (voices of that language, first one selected), and the
	        language -> voice-keys mapping.

	    Raises:
	        gr.Error: if the CLI is missing, exits non-zero, or lists no voices.
	    """

	    def _lang_label(code: str) -> str:
	        # Human-readable label for a code like en_US or ko_KR; the code itself
	        # stays the dropdown value.
	        names = {
	            "en": "English",
	            "ko": "Korean",
	            "de": "German",
	            "es": "Spanish",
	            "fr": "French",
	            "it": "Italian",
	            "pt": "Portuguese",
	            "nl": "Dutch",
	            "sv": "Swedish",
	            "no": "Norwegian",
	            "da": "Danish",
	            "fi": "Finnish",
	            "pl": "Polish",
	            "ru": "Russian",
	            "tr": "Turkish",
	            "ar": "Arabic",
	            "hi": "Hindi",
	            "ja": "Japanese",
	            "zh": "Chinese",
	        }
	        base, _sep, region = code.partition("_")
	        # Unknown base languages fall back to showing the full code.
	        display = names.get(base.lower(), code)
	        if region:
	            return f"{display} ({region})"
	        return display

	    try:
	        listing = subprocess.run(
	            ["mimic3", "--voices"],
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            check=False,
	        )
	        if listing.returncode != 0:
	            details = listing.stderr.decode(errors="ignore")
	            raise gr.Error(f"Failed to list voices.\n\n{details}")

	        voices_by_lang = _parse_voices(listing.stdout.decode(errors="ignore"))
	        if not voices_by_lang:
	            raise gr.Error("No voices found. Try again after models are available.")

	        lang_codes = sorted(voices_by_lang)
	        first_lang = lang_codes[0]
	        first_voices = voices_by_lang[first_lang]
	        labelled_languages = [(_lang_label(code), code) for code in lang_codes]

	        # Two dropdown updates plus the mapping kept in state.
	        language_update = gr.update(choices=labelled_languages, value=first_lang)
	        voice_update = gr.update(
	            choices=first_voices,
	            value=first_voices[0] if first_voices else None,
	        )
	        return language_update, voice_update, voices_by_lang
	    except FileNotFoundError:
	        raise gr.Error("mimic3 CLI not found. Ensure 'mycroft-mimic3-tts' is installed.")


	def on_language_change(lang: str, mapping: dict):
	    """Return a voice-dropdown update listing the voices of ``lang``.

	    A non-dict ``mapping`` or an unknown language yields an empty choice list
	    with no selection; otherwise the first voice is selected.
	    """
	    if isinstance(mapping, dict):
	        available = mapping.get(lang, [])
	    else:
	        available = []
	    selected = available[0] if available else None
	    return gr.update(choices=available, value=selected)


	def filter_voices(search: str, lang: str, mapping: dict):
	    """Return a voice-dropdown update with the voices of ``lang`` matching ``search``.

	    Matching is a case-insensitive substring test on the voice key. An empty
	    ``search`` keeps every voice of the language; the first remaining voice
	    (if any) becomes the selected value.
	    """
	    if isinstance(mapping, dict):
	        candidates = mapping.get(lang, [])
	    else:
	        candidates = []

	    if not search:
	        matches = candidates
	    else:
	        needle = search.strip().lower()
	        matches = list(filter(lambda key: needle in key.lower(), candidates))

	    selected = matches[0] if matches else None
	    return gr.update(choices=matches, value=selected)


	with gr.Blocks(title="Mimic 3 TTS") as demo:
	    # ---- Layout ---------------------------------------------------------
	    gr.Markdown(f"# Mimic 3 TTS\n{DESCRIPTION}")

	    with gr.Row():
	        text = gr.Textbox(
	            label="Text",
	            placeholder="Type text to synthesize…",
	            lines=4,
	        )

	    # Voice selection: language and voice dropdowns, a search box with a
	    # refresh button, and a free-form override.
	    with gr.Row():
	        language_dd = gr.Dropdown(label="Language", choices=[], interactive=True)
	        voice_dd = gr.Dropdown(label="Voice", choices=[], interactive=True)
	    with gr.Row():
	        voice_search = gr.Textbox(
	            label="Voice Search (filters by language)",
	            placeholder="Type to filter voices, e.g., 'ko' or 'female' if present in key",
	        )
	        refresh_btn = gr.Button("Refresh Voices")
	    with gr.Row():
	        custom_voice = gr.Textbox(
	            label="Custom Voice Key (optional)",
	            placeholder="Overrides Voice dropdown if provided",
	        )
	    # Holds the language -> voice-keys mapping returned by load_voices.
	    voices_state = gr.State({})

	    with gr.Accordion("Advanced (SSML)", open=False):
	        use_ssml = gr.Checkbox(label="Use SSML prosody for rate/pitch", value=False)
	        with gr.Row():
	            rate = gr.Textbox(label="Rate (e.g., 85%, 110%)", placeholder="Optional")
	            pitch = gr.Textbox(label="Pitch (e.g., +2st, -2st)", placeholder="Optional")

	    with gr.Row():
	        btn = gr.Button("Synthesize", variant="primary")

	    audio = gr.Audio(label="Output Audio", type="filepath")

	    # ---- Event wiring ---------------------------------------------------
	    # Populate both dropdowns and the mapping state when the page loads.
	    demo.load(fn=load_voices, inputs=None, outputs=[language_dd, voice_dd, voices_state])

	    # Picking another language swaps the voice list.
	    language_dd.change(fn=on_language_change, inputs=[language_dd, voices_state], outputs=[voice_dd])

	    # Typing in the search box narrows the voice list for the current language.
	    voice_search.change(
	        fn=filter_voices,
	        inputs=[voice_search, language_dd, voices_state],
	        outputs=[voice_dd],
	    )

	    # The refresh button re-runs the same loader used at page load.
	    refresh_btn.click(fn=load_voices, inputs=None, outputs=[language_dd, voice_dd, voices_state])

	    def synthesize_with_custom(t: str, selected_voice: str, custom: str, use_ssml_val: bool, rate_val: str, pitch_val: str):
	        """Synthesize with the custom voice key when one is given, else the dropdown voice."""
	        override = custom.strip() if custom else ""
	        voice = override if override else selected_voice
	        return synthesize(t, voice, use_ssml_val, rate_val, pitch_val)

	    btn.click(
	        fn=synthesize_with_custom,
	        inputs=[text, voice_dd, custom_voice, use_ssml, rate, pitch],
	        outputs=[audio],
	    )

	if __name__ == "__main__":
	    # Listen on all interfaces; the port comes from the PORT environment
	    # variable and falls back to 7860 when it is unset.
	    listen_port = int(os.environ.get("PORT", 7860))
	    demo.launch(server_name="0.0.0.0", server_port=listen_port)