"""Gradio demo that serves the CosyVoice-300M-SFT text-to-speech model.

On start-up the script makes sure the CosyVoice sources are available locally,
downloads the pretrained weights from the Hugging Face Hub, loads the model,
and launches a small web UI on port 7860.
"""

import os
import subprocess
import sys

import gradio as gr
import numpy as np
from huggingface_hub import snapshot_download

_REPO_URL = "https://github.com/FunAudioLLM/CosyVoice.git"
_REPO_DIR = "CosyVoice"

# Clone only when the checkout is missing: `git clone` exits non-zero if the
# target directory already exists, which (with check=True) would crash every
# restart after the first one.
if not os.path.isdir(_REPO_DIR):
    subprocess.run(
        ["git", "clone", "--recursive", _REPO_URL, _REPO_DIR],
        check=True,
    )

# The cosyvoice package and its vendored Matcha-TTS dependency are imported
# straight from the checkout, so both paths must be on sys.path first.
sys.path.insert(0, "CosyVoice/third_party/Matcha-TTS")
sys.path.insert(0, "CosyVoice")

model_dir = snapshot_download(
    "FunAudioLLM/CosyVoice-300M-SFT",
    local_dir="pretrained_models/CosyVoice-300M-SFT",
)

# Deliberately imported late: this only resolves after the sys.path edits above.
from cosyvoice.cli.cosyvoice import CosyVoice  # noqa: E402

cosyvoice = CosyVoice(model_dir)
spk_list = cosyvoice.list_available_spks()


def tts(text, spk):
    """Synthesize *text* with the speaker *spk*.

    Args:
        text: The text to speak.
        spk: Speaker identifier, one of ``spk_list``.

    Returns:
        A ``(sample_rate, samples)`` tuple in the form ``gr.Audio`` accepts,
        where ``samples`` is a 1-D NumPy array.

    Raises:
        gr.Error: If *text* is blank or the model produced no audio.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # inference_sft is a generator and may yield more than one result
    # (presumably one per text segment -- confirm against CosyVoice). Collect
    # every piece so that none of the speech is dropped.
    chunks = [
        result["tts_speech"].numpy().flatten()
        for result in cosyvoice.inference_sft(text, spk, stream=False)
    ]
    if not chunks:
        raise gr.Error("No audio was generated for the given text.")

    return (cosyvoice.sample_rate, np.concatenate(chunks))


demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="Text", value="你好,我是通义生成式语音大模型。"),
        gr.Dropdown(
            choices=spk_list,
            # Guard against an empty speaker list so building the UI cannot
            # fail with IndexError.
            value=spk_list[0] if spk_list else None,
            label="Speaker",
        ),
    ],
    outputs=gr.Audio(label="Audio"),
)

demo.launch(server_name="0.0.0.0", server_port=7860)