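# NOTE(review): the next three lines are non-code residue captured together
# with this file (they look like page metadata rather than program text).
# Kept verbatim as comments so the module can be parsed.
#   huseinzol05's picture
#   fix info
#   10693d8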
import gradio as gr
try:
    import spaces
except ImportError:
    class spaces:
        """Minimal stand-in used when the ``spaces`` package is not installed.

        Only the ``GPU`` decorator is provided, and it is a no-op, so the app
        can still be imported and run without the real package.
        """

        @staticmethod
        def GPU(fn=None, **_options):
            """No-op replacement for the ``spaces.GPU`` decorator.

            Supports both usages:

            * bare -- ``@spaces.GPU`` -- where ``fn`` is the decorated function;
            * called with keyword options -- ``@spaces.GPU(duration=60)`` --
              where ``fn`` is ``None`` and a decorator must be returned.

            In both cases the decorated function is returned unchanged and
            any keyword options are ignored.
            """
            if fn is None:
                # Options-only call: hand back a pass-through decorator.
                return lambda func: func
            return fn
import torch
import numpy as np
import re
from neucodec import NeuCodec
from transformers import AutoModelForCausalLM, AutoTokenizer
# ── Model config ─────────────────────────────────────────────────────────────
# Maps each selectable model size to the checkpoint name passed to
# `from_pretrained` in load_model().
MODEL_IDS = {
    size: f"Scicom-intl/Multilingual-Expressive-TTS-{size}"
    for size in ("0.6B", "1.7B")
}

# Speaker names offered by default in the UI dropdown.
DEFAULT_SPEAKERS = [
    "multilingual-tts_audio_Grace",
    "elevenlabs_audio_Alexandr Vlasov - Professional Voiceover",
    "multilingual-tts_audio_Domi",
    "gemini-flash-2.0-speech_data_audio_kore",
    "genshin-voice_audio_Rahman",
    "multilingual-tts_audio_Nicole",
    "OutteTTS-urdu-dataset_audio_uat_speaker",
]

# Sample rate returned together with the waveform by generate().
SAMPLE_RATE = 24_000

# Cache of (model, tokenizer) pairs keyed by model size; filled by load_model().
_loaded: dict = {}

# Shared codec instance; stays None until load_neucodec() creates it.
codec = None
def load_model(size: str):
    """Return the ``(model, tokenizer)`` pair for ``size``, loading it once.

    The first request for a given size loads the checkpoint named in
    ``MODEL_IDS``, moves the model to CUDA and stores the pair in the
    module-level ``_loaded`` cache; later requests reuse the cached pair.

    Args:
        size: Key into ``MODEL_IDS`` (for example ``"0.6B"``).

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        KeyError: If ``size`` is not cached and is not a key of ``MODEL_IDS``.
    """
    # Fast path: this size has already been loaded.
    if size in _loaded:
        return _loaded[size]

    checkpoint = MODEL_IDS[size]
    language_model = AutoModelForCausalLM.from_pretrained(checkpoint).cuda()
    text_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Cache only after both pieces loaded successfully.
    _loaded[size] = (language_model, text_tokenizer)
    return _loaded[size]
def load_neucodec():
    """Return the shared NeuCodec instance, creating it on first use.

    On the first call the codec is loaded from ``"neuphonic/neucodec"``, put
    in eval mode and moved to CUDA. It is stored in the module-level ``codec``
    variable only after all of that has succeeded, so a failure part-way
    through (for example during the device move) leaves the cache empty and
    the next call retries instead of reusing a half-initialised codec.

    Returns:
        The cached codec instance.
    """
    global codec
    if codec is None:
        fresh_codec = NeuCodec.from_pretrained("neuphonic/neucodec")
        # Return value is intentionally ignored, as in the original setup.
        fresh_codec.eval().to('cuda')
        # Publish to the global cache only once setup is complete.
        codec = fresh_codec
    return codec
# Matches one generated speech token and captures its integer code.
_AUDIO_TOKEN_PATTERN = re.compile(r'<\|s_(\d+)\|>')


def _build_prompt(speaker: str, text: str, description: str) -> str:
    """Assemble the text prompt that precedes the generated speech tokens."""
    # A missing or whitespace-only description is treated as "no description".
    if description and description.strip():
        return f"<|im_start|>{speaker}: {text}<|description|>{description}<|speech_start|>"
    return f"<|im_start|>{speaker}: {text}<|speech_start|>"


def _extract_audio_codes(generated: str) -> list:
    """Return the integer code of every ``<|s_N|>`` token found in ``generated``."""
    return [int(code) for code in _AUDIO_TOKEN_PATTERN.findall(generated)]


@spaces.GPU
def generate(speaker_choice: str, custom_speaker: str,
             model_size: str, text: str, description: str, temperature: float = 0.8):
    """Synthesize speech for ``text`` and return it for the audio output.

    Args:
        speaker_choice: Selected dropdown entry; ``"Custom..."`` means the
            speaker name is taken from ``custom_speaker`` instead.
        custom_speaker: Free-form speaker name, used only for ``"Custom..."``.
        model_size: Model size key passed to ``load_model``.
        text: Text to synthesize.
        description: Optional voice style description; omitted from the
            prompt when empty, blank or ``None``.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        A ``(SAMPLE_RATE, waveform)`` tuple, where ``waveform`` is the NumPy
        array taken from index ``[0, 0]`` of the decoded audio.

    Raises:
        gr.Error: If no speaker name or text was supplied, or if the model
            output contains no speech tokens.
    """
    # Resolve speaker name (tolerate a missing custom value).
    if speaker_choice == "Custom...":
        speaker = (custom_speaker or "").strip()
    else:
        speaker = speaker_choice
    if not speaker:
        raise gr.Error("Please enter a custom speaker name.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    gr.Info("Loading model...")
    model, tokenizer = load_model(model_size)
    gr.Info("Loading codec...")
    codec = load_neucodec()
    gr.Info("Generating audio...")

    prompt = _build_prompt(speaker, text, description)
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=1.15,
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on '<|speech_start|>' picks the wrong segment when the
    # user's text itself contains that marker.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=False)
    # Keep the previous behaviour of stopping at a further '<|speech_start|>'.
    speech_segment = generated_text.split('<|speech_start|>')[0]

    audio_tokens = _extract_audio_codes(speech_segment)
    if not audio_tokens:
        # An empty code list would otherwise reach the codec as an empty
        # float tensor and fail with an unrelated-looking error.
        raise gr.Error("The model did not produce any audio. Please try again.")

    audio_codes = torch.tensor(audio_tokens)[None, None]
    with torch.no_grad():
        audio_waveform = codec.decode_code(audio_codes.cuda())
    audio_np = audio_waveform[0, 0].cpu().numpy()
    return SAMPLE_RATE, audio_np
# ── UI ────────────────────────────────────────────────────────────────────────
# Introductory Markdown rendered at the top of the page.
INTRO_MARKDOWN = """# Expressive Multilingual TTS
A multilingual expressive text-to-speech system available in two sizes:
- **0.6B** — [Scicom-intl/Multilingual-Expressive-TTS-0.6B](https://huggingface.co/Scicom-intl/Multilingual-Expressive-TTS-0.6B)
- **1.7B** — [Scicom-intl/Multilingual-Expressive-TTS-1.7B](https://huggingface.co/Scicom-intl/Multilingual-Expressive-TTS-1.7B)
The model supports **mid-sentence language switching** across many languages in a single utterance, e.g.:
> *Hi nama saya Husein, I am so cute, 我喜欢吃鸡饭, boire du thé glacé, ולהירגע על החוף, وأحب أن أتعرض لبعض أشعة الشمس.*
"""

# Example rows; column order matches the form inputs:
# speaker, custom speaker, model size, text, description, temperature.
EXAMPLE_ROWS = [
    [
        "multilingual-tts_audio_Grace",
        "",
        "1.7B",
        "Hi nama saya Husein, I am so cute, 我喜欢吃鸡饭, boire du thé glacé, ולהירגע על החוף, وأحب أن أتعرض لبعض أشعة الشمس, हैलो आज आप कैसे हैं? Здравствуйте, как у вас дела сегодня?",
        "A warm and friendly female voice.",
        0.8,
    ],
    [
        "genshin-voice_audio_Rahman",
        "",
        "1.7B",
        "Selamat pagi, apa khabar? صبح بخیر، حال و احوالت چطوره؟, Dzień dobry, jak się masz?",
        "A calm male voice with a Malaysian accent.",
        0.8,
    ],
    [
        "multilingual-tts_audio_Domi",
        "",
        "1.7B",
        "The weather is beautiful today, Veðrið er fallegt í dag, 오늘은 날씨가 정말 좋네요, 今日は天気がとても良いです, מזג האוויר יפהפה היום, अद्यत्वे मौसमः सुन्दरः अस्ति.",
        "An expressive and cheerful male voice.",
        0.8,
    ],
]

with gr.Blocks(title="Expressive Multilingual TTS") as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        # Left column: all inputs plus the trigger button.
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                label="Speaker",
                choices=DEFAULT_SPEAKERS + ["Custom..."],
                value=DEFAULT_SPEAKERS[0],
            )
            # Hint and textbox below stay hidden until "Custom..." is chosen.
            custom_speaker_label = gr.Markdown(
                "or you can use any speaker name from "
                "[malaysia-ai/Multilingual-TTS](https://huggingface.co/datasets/malaysia-ai/Multilingual-TTS), "
                "e.g. `700h-tr-turkish-text-to-speech_audio_0`",
                visible=False,
            )
            custom_speaker = gr.Textbox(
                label="Custom speaker name",
                placeholder="Type your own speaker name...",
                visible=False,
            )
            model_size = gr.Radio(
                label="Model size",
                choices=["0.6B", "1.7B"],
                value="0.6B",
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to synthesize...",
                lines=4,
            )
            description_input = gr.Textbox(
                label="Description",
                info="Optional voice style description. Note: the model's main strength is multilingual — it may not always follow the description precisely.",
                placeholder="Describe the voice style, e.g. 'A calm female voice with a slight Malaysian accent'",
                lines=3,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.5,
                maximum=1.2,
                value=0.8,
                step=0.05,
            )
            generate_btn = gr.Button("Generate", variant="primary")

        # Right column: the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(label="Output", type="numpy")

    def toggle_custom(choice):
        """Show the custom-speaker hint and textbox only for "Custom..."."""
        show = choice == "Custom..."
        hint_update = gr.update(visible=show)
        textbox_update = gr.update(visible=show)
        return hint_update, textbox_update

    # Components fed to generate(), in its positional argument order; the
    # example rows use the same order.
    form_inputs = [
        speaker_dropdown,
        custom_speaker,
        model_size,
        text_input,
        description_input,
        temperature,
    ]

    speaker_dropdown.change(
        toggle_custom,
        inputs=speaker_dropdown,
        outputs=[custom_speaker_label, custom_speaker],
    )
    generate_btn.click(fn=generate, inputs=form_inputs, outputs=audio_output)

    gr.Markdown("*Note: Example texts are translated using Google Translate and may not be accurate — for demo purposes only.*")
    gr.Examples(examples=EXAMPLE_ROWS, inputs=form_inputs)

demo.launch()