Spaces:
Running on Zero
| import gradio as gr | |
try:
    import spaces
except ImportError:
    class spaces:
        """Stand-in used when the ``spaces`` package is not installed.

        Only the ``GPU`` decorator is provided, and it does nothing, so code
        decorated with it runs unchanged outside a Space.
        """

        @staticmethod
        def GPU(fn=None, **_options):
            """Return ``fn`` unchanged; usable bare or with keyword options.

            Supports both ``@spaces.GPU`` and ``@spaces.GPU(duration=...)``.
            Keyword options are accepted and ignored.
            """
            if fn is None:
                # Called as ``spaces.GPU(**options)``: hand back a decorator.
                return lambda wrapped: wrapped
            return fn
| import torch | |
| import numpy as np | |
| import re | |
| from neucodec import NeuCodec | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
# ── Model config ─────────────────────────────────────────────────────────────
# Both checkpoints share one repository naming scheme and differ only in the
# size suffix, which doubles as the key used to select them.
_MODEL_REPO_TEMPLATE = "Scicom-intl/Multilingual-Expressive-TTS-{}"
MODEL_IDS = {
    size: _MODEL_REPO_TEMPLATE.format(size)
    for size in ("0.6B", "1.7B")
}

# Speaker names offered in the dropdown (kept as a list: the UI appends to it).
DEFAULT_SPEAKERS = [
    "multilingual-tts_audio_Grace",
    "elevenlabs_audio_Alexandr Vlasov - Professional Voiceover",
    "multilingual-tts_audio_Domi",
    "gemini-flash-2.0-speech_data_audio_kore",
    "genshin-voice_audio_Rahman",
    "multilingual-tts_audio_Nicole",
    "OutteTTS-urdu-dataset_audio_uat_speaker",
]

# Sample rate reported alongside the generated waveform.
SAMPLE_RATE = 24_000

# Cache of already-loaded (model, tokenizer) pairs, keyed by model size.
_loaded = {}

# Lazily created NeuCodec instance; stays None until first use.
codec = None
def load_model(size: str):
    """Return the ``(model, tokenizer)`` pair for ``size``, loading it on first use.

    ``size`` is looked up in ``MODEL_IDS`` (an unknown key raises ``KeyError``).
    The model is moved to CUDA, and the pair is stored in ``_loaded`` so that
    later calls with the same ``size`` reuse it.
    """
    cached = _loaded.get(size)
    if cached is not None:
        return cached

    repo_id = MODEL_IDS[size]
    language_model = AutoModelForCausalLM.from_pretrained(repo_id).cuda()
    text_tokenizer = AutoTokenizer.from_pretrained(repo_id)

    pair = (language_model, text_tokenizer)
    _loaded[size] = pair
    return pair
def load_neucodec():
    """Return the shared NeuCodec instance, creating it on the first call.

    The instance is stored in the module-level ``codec`` variable, switched to
    eval mode and moved to CUDA once; subsequent calls return it as-is.
    """
    global codec
    if codec is not None:
        return codec

    codec = NeuCodec.from_pretrained("neuphonic/neucodec")
    codec.eval().to('cuda')
    return codec
# NOTE(review): `spaces` is imported at the top of this file but `spaces.GPU`
# is never applied anywhere visible; if this app runs on ZeroGPU hardware the
# GPU entry point is normally decorated with it — confirm whether that is
# intended here.
def generate(speaker_choice: str, custom_speaker: str,
             model_size: str, text: str, description: str, temperature: float = 0.8):
    """Synthesize speech for ``text`` and return ``(sample_rate, waveform)``.

    Args:
        speaker_choice: Value of the speaker dropdown. The literal
            ``"Custom..."`` means the name in ``custom_speaker`` is used.
        custom_speaker: Free-form speaker name; only read when
            ``speaker_choice`` is ``"Custom..."``.
        model_size: Key passed to ``load_model`` to pick the checkpoint.
        text: Text to synthesize.
        description: Optional voice-style description. When empty or
            whitespace-only, the ``<|description|>`` section is left out of
            the prompt.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        ``(SAMPLE_RATE, audio)`` where ``audio`` is the ``[0, 0]`` slice of the
        codec output converted to a NumPy array.

    Raises:
        gr.Error: If no speaker name or no text was provided, or if the model
            produced no speech tokens to decode.
    """
    # Treat a missing value like an empty field so validation below reports it
    # as a user-facing error instead of failing on ``None.strip()``.
    custom_speaker = custom_speaker or ""
    text = text or ""
    description = description or ""

    # Resolve speaker name
    speaker = custom_speaker.strip() if speaker_choice == "Custom..." else speaker_choice
    if not speaker:
        raise gr.Error("Please enter a custom speaker name.")
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    gr.Info("Loading model...")
    model, tokenizer = load_model(model_size)
    gr.Info("Loading codec...")
    # Named differently from the module-level ``codec`` to avoid shadowing it.
    neucodec = load_neucodec()
    gr.Info("Generating audio...")

    # A whitespace-only description carries no style information, so it is
    # handled the same way as an empty one.
    if description.strip():
        prompt = f"<|im_start|>{speaker}: {text}<|description|>{description}<|speech_start|>"
    else:
        prompt = f"<|im_start|>{speaker}: {text}<|speech_start|>"

    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=1.15,
        )

    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated after them so that markers typed into the user's text
    # (e.g. "<|speech_start|>" or "<|s_12|>") can never be parsed as audio codes.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=False)
    audio_tokens = [int(token) for token in re.findall(r'<\|s_(\d+)\|>', generated_text)]
    if not audio_tokens:
        # Without this check an empty tensor would be handed to the codec and
        # fail with an error that means nothing to the user.
        raise gr.Error("The model did not produce any audio. Please try again or adjust the text.")

    # Add two leading singleton axes before handing the codes to the codec.
    audio_codes = torch.tensor(audio_tokens)[None, None]
    with torch.no_grad():
        audio_waveform = neucodec.decode_code(audio_codes.cuda())
    audio_np = audio_waveform[0, 0].cpu().numpy()
    return SAMPLE_RATE, audio_np
# ── UI ────────────────────────────────────────────────────────────────────────
# Layout: an intro, then a two-column row (controls on the left, audio output
# on the right), followed by the event wiring and a set of clickable examples.
with gr.Blocks(title="Expressive Multilingual TTS") as demo:
    # Blank lines separate the heading, paragraphs, list and quote so that the
    # paragraph after the bullet list is not rendered as part of the last bullet.
    gr.Markdown("""# Expressive Multilingual TTS

A multilingual expressive text-to-speech system available in two sizes:

- **0.6B** — [Scicom-intl/Multilingual-Expressive-TTS-0.6B](https://huggingface.co/Scicom-intl/Multilingual-Expressive-TTS-0.6B)
- **1.7B** — [Scicom-intl/Multilingual-Expressive-TTS-1.7B](https://huggingface.co/Scicom-intl/Multilingual-Expressive-TTS-1.7B)

The model supports **mid-sentence language switching** across many languages in a single utterance, e.g.:

> *Hi nama saya Husein, I am so cute, 我喜欢吃鸡饭, boire du thé glacé, ולהירגע על החוף, وأحب أن أتعرض لبعض أشعة الشمس.*
""")
    with gr.Row():
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                choices=DEFAULT_SPEAKERS + ["Custom..."],
                value=DEFAULT_SPEAKERS[0],
                label="Speaker",
            )
            # Hint and textbox below stay hidden until "Custom..." is selected.
            custom_speaker_label = gr.Markdown(
                "or you can use any speaker name from "
                "[malaysia-ai/Multilingual-TTS](https://huggingface.co/datasets/malaysia-ai/Multilingual-TTS), "
                "e.g. `700h-tr-turkish-text-to-speech_audio_0`",
                visible=False,
            )
            custom_speaker = gr.Textbox(
                label="Custom speaker name",
                placeholder="Type your own speaker name...",
                visible=False,
            )
            model_size = gr.Radio(
                # Derived from MODEL_IDS so the choices cannot drift from the
                # checkpoints that load_model() actually knows about.
                choices=list(MODEL_IDS),
                value="0.6B",
                label="Model size",
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to synthesize...",
                lines=4,
            )
            description_input = gr.Textbox(
                label="Description",
                info="Optional voice style description. Note: the model's main strength is multilingual — it may not always follow the description precisely.",
                placeholder="Describe the voice style, e.g. 'A calm female voice with a slight Malaysian accent'",
                lines=3,
            )
            temperature = gr.Slider(
                minimum=0.5, maximum=1.2, value=0.8, step=0.05,
                label="Temperature",
            )
            generate_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Output", type="numpy")

    # Show/hide custom speaker label + textbox
    def toggle_custom(choice):
        """Return visibility updates for the custom-speaker hint and textbox."""
        visible = choice == "Custom..."
        return gr.update(visible=visible), gr.update(visible=visible)

    speaker_dropdown.change(toggle_custom, inputs=speaker_dropdown, outputs=[custom_speaker_label, custom_speaker])
    generate_btn.click(
        fn=generate,
        inputs=[speaker_dropdown, custom_speaker, model_size, text_input, description_input, temperature],
        outputs=audio_output,
    )

    gr.Markdown("*Note: Example texts are translated using Google Translate and may not be accurate — for demo purposes only.*")
    # Each example row follows the order of `inputs` below:
    # speaker, custom speaker, model size, text, description, temperature.
    gr.Examples(
        examples=[
            [
                "multilingual-tts_audio_Grace",
                "",
                "1.7B",
                "Hi nama saya Husein, I am so cute, 我喜欢吃鸡饭, boire du thé glacé, ולהירגע על החוף, وأحب أن أتعرض لبعض أشعة الشمس, हैलो आज आप कैसे हैं? Здравствуйте, как у вас дела сегодня?",
                "A warm and friendly female voice.",
                0.8,
            ],
            [
                "genshin-voice_audio_Rahman",
                "",
                "1.7B",
                "Selamat pagi, apa khabar? صبح بخیر، حال و احوالت چطوره؟, Dzień dobry, jak się masz?",
                "A calm male voice with a Malaysian accent.",
                0.8,
            ],
            [
                "multilingual-tts_audio_Domi",
                "",
                "1.7B",
                "The weather is beautiful today, Veðrið er fallegt í dag, 오늘은 날씨가 정말 좋네요, 今日は天気がとても良いです, מזג האוויר יפהפה היום, अद्यत्वे मौसमः सुन्दरः अस्ति.",
                "An expressive and cheerful male voice.",
                0.8,
            ],
        ],
        inputs=[speaker_dropdown, custom_speaker, model_size, text_input, description_input, temperature],
    )

demo.launch()