Spaces:

Scicom-intl
/

Multilingual-Expressive-TTS

Running on Zero

App Files Files Community

Multilingual-Expressive-TTS / app.py

huseinzol05

fix info

10693d8 2 months ago

raw

history blame contribute delete

7.59 kB

	import gradio as gr

	try:
	import spaces
	except ImportError:
	class spaces:
	@staticmethod
	def GPU(fn):
	return fn

	import torch
	import numpy as np
	import re
	from neucodec import NeuCodec
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# ── Model config ─────────────────────────────────────────────────────────────

	MODEL_IDS = {
	"0.6B": "Scicom-intl/Multilingual-Expressive-TTS-0.6B",
	"1.7B": "Scicom-intl/Multilingual-Expressive-TTS-1.7B",
	}

	DEFAULT_SPEAKERS = [
	"multilingual-tts_audio_Grace",
	"elevenlabs_audio_Alexandr Vlasov - Professional Voiceover",
	"multilingual-tts_audio_Domi",
	"gemini-flash-2.0-speech_data_audio_kore",
	"genshin-voice_audio_Rahman",
	"multilingual-tts_audio_Nicole",
	"OutteTTS-urdu-dataset_audio_uat_speaker",
	]

	SAMPLE_RATE = 24000

	_loaded = {}
	codec = None


	def load_model(size: str):
	if size not in _loaded:
	model_name = MODEL_IDS[size]
	model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	_loaded[size] = (model, tokenizer)
	return _loaded[size]

	def load_neucodec():
	global codec
	if codec is None:
	codec = NeuCodec.from_pretrained("neuphonic/neucodec")
	_ = codec.eval().to('cuda')
	return codec

	@spaces.GPU
	def generate(speaker_choice: str, custom_speaker: str,
	model_size: str, text: str, description: str, temperature: float = 0.8):

	# Resolve speaker name
	speaker = custom_speaker.strip() if speaker_choice == "Custom..." else speaker_choice
	if not speaker:
	raise gr.Error("Please enter a custom speaker name.")
	if not text.strip():
	raise gr.Error("Please enter some text to synthesize.")

	gr.Info("Loading model...")
	model, tokenizer = load_model(model_size)
	gr.Info("Loading codec...")
	codec = load_neucodec()
	gr.Info("Generating audio...")

	if len(description):
	prompt = f"<\|im_start\|>{speaker}: {text}<\|description\|>{description}<\|speech_start\|>"
	else:
	prompt = f"<\|im_start\|>{speaker}: {text}<\|speech_start\|>"

	inputs = tokenizer(prompt,return_tensors="pt", add_special_tokens=True).to(model.device)

	with torch.no_grad():
	outputs = model.generate(
	**inputs,
	max_new_tokens=2048,
	do_sample=True,
	temperature=temperature,
	repetition_penalty=1.15,
	)

	generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
	audio_tokens = re.findall(r'<\\|s_(\d+)\\|>', generated_text.split('<\|speech_start\|>')[1])
	audio_tokens = [int(token) for token in audio_tokens]
	audio_codes = torch.tensor(audio_tokens)[None, None]

	with torch.no_grad():
	audio_waveform = codec.decode_code(audio_codes.cuda())

	audio_np = audio_waveform[0, 0].cpu().numpy()
	return SAMPLE_RATE, audio_np


	# ── UI ────────────────────────────────────────────────────────────────────────

	with gr.Blocks(title="Expressive Multilingual TTS") as demo:
	gr.Markdown("""# Expressive Multilingual TTS

	A multilingual expressive text-to-speech system available in two sizes:
	- 0.6B — [Scicom-intl/Multilingual-Expressive-TTS-0.6B](https://huggingface.co/Scicom-intl/Multilingual-Expressive-TTS-0.6B)
	- 1.7B — [Scicom-intl/Multilingual-Expressive-TTS-1.7B](https://huggingface.co/Scicom-intl/Multilingual-Expressive-TTS-1.7B)

	The model supports mid-sentence language switching across many languages in a single utterance, e.g.:
	> Hi nama saya Husein, I am so cute, 我喜欢吃鸡饭, boire du thé glacé, ולהירגע על החוף, وأحب أن أتعرض لبعض أشعة الشمس.
	""")

	with gr.Row():
	with gr.Column():
	speaker_dropdown = gr.Dropdown(
	choices=DEFAULT_SPEAKERS + ["Custom..."],
	value=DEFAULT_SPEAKERS[0],
	label="Speaker",
	)
	custom_speaker_label = gr.Markdown(
	"or you can use any speaker name from "
	"[malaysia-ai/Multilingual-TTS](https://huggingface.co/datasets/malaysia-ai/Multilingual-TTS), "
	"e.g. `700h-tr-turkish-text-to-speech_audio_0`",
	visible=False,
	)
	custom_speaker = gr.Textbox(
	label="Custom speaker name",
	placeholder="Type your own speaker name...",
	visible=False,
	)
	model_size = gr.Radio(
	choices=["0.6B", "1.7B"],
	value="0.6B",
	label="Model size",
	)
	text_input = gr.Textbox(
	label="Text",
	placeholder="Enter the text to synthesize...",
	lines=4,
	)
	description_input = gr.Textbox(
	label="Description",
	info="Optional voice style description. Note: the model's main strength is multilingual — it may not always follow the description precisely.",
	placeholder="Describe the voice style, e.g. 'A calm female voice with a slight Malaysian accent'",
	lines=3,
	)
	temperature = gr.Slider(
	minimum=0.5, maximum=1.2, value=0.8, step=0.05,
	label="Temperature",
	)
	generate_btn = gr.Button("Generate", variant="primary")

	with gr.Column():
	audio_output = gr.Audio(label="Output", type="numpy")

	# Show/hide custom speaker label + textbox
	def toggle_custom(choice):
	visible = choice == "Custom..."
	return gr.update(visible=visible), gr.update(visible=visible)

	speaker_dropdown.change(toggle_custom, inputs=speaker_dropdown, outputs=[custom_speaker_label, custom_speaker])

	generate_btn.click(
	fn=generate,
	inputs=[speaker_dropdown, custom_speaker, model_size, text_input, description_input, temperature],
	outputs=audio_output,
	)

	gr.Markdown("Note: Example texts are translated using Google Translate and may not be accurate — for demo purposes only.")

	gr.Examples(
	examples=[
	["multilingual-tts_audio_Grace", "", "1.7B", "Hi nama saya Husein, I am so cute, 我喜欢吃鸡饭, boire du thé glacé, ולהירגע על החוף, وأحب أن أتعرض لبعض أشعة الشمس, हैलो आज आप कैसे हैं? Здравствуйте, как у вас дела сегодня?", "A warm and friendly female voice.", 0.8],
	["genshin-voice_audio_Rahman", "", "1.7B", "Selamat pagi, apa khabar? صبح بخیر، حال و احوالت چطوره؟, Dzień dobry, jak się masz?", "A calm male voice with a Malaysian accent.", 0.8],
	["multilingual-tts_audio_Domi", "", "1.7B", "The weather is beautiful today, Veðrið er fallegt í dag, 오늘은 날씨가 정말 좋네요, 今日は天気がとても良いです, מזג האוויר יפהפה היום, अद्यत्वे मौसमः सुन्दरः अस्ति.", "An expressive and cheerful male voice.", 0.8],
	],
	inputs=[speaker_dropdown, custom_speaker, model_size, text_input, description_input, temperature],
	)

	demo.launch()