| import os, sys, subprocess, numpy as np, gradio as gr |
| from huggingface_hub import snapshot_download |
|
|
# Fetch the CosyVoice sources (including submodules) only when they are not
# already present. `git clone` refuses to write into an existing non-empty
# directory, so an unconditional clone with check=True would abort every
# start after the first one in the same working directory.
if not os.path.isdir("CosyVoice"):
    subprocess.run(
        [
            "git",
            "clone",
            "--recursive",
            "https://github.com/FunAudioLLM/CosyVoice.git",
            "CosyVoice",
        ],
        check=True,
    )

# Make the checkout importable. The bundled Matcha-TTS directory is added as
# well (presumably required by CosyVoice's own imports - confirm upstream).
# Both entries go to the front of sys.path; "CosyVoice" is inserted last and
# therefore takes precedence.
sys.path.insert(0, "CosyVoice/third_party/Matcha-TTS")
sys.path.insert(0, "CosyVoice")
|
|
# Download (or reuse) the SFT checkpoint under ./pretrained_models and keep
# the path that snapshot_download returns.
model_dir = snapshot_download(
    repo_id="FunAudioLLM/CosyVoice-300M-SFT",
    local_dir="pretrained_models/CosyVoice-300M-SFT",
)

# This import must stay below the sys.path changes made earlier in the file:
# the `cosyvoice` package is resolved from the cloned checkout.
from cosyvoice.cli.cosyvoice import CosyVoice

# One shared model instance for the whole app, plus the speaker names it
# reports as available.
cosyvoice = CosyVoice(model_dir)
spk_list = cosyvoice.list_available_spks()
|
|
def tts(text, spk):
    """Synthesize ``text`` with the speaker ``spk`` and return it for Gradio.

    ``cosyvoice.inference_sft`` is consumed as an iterable of result dicts.
    The ``"tts_speech"`` entry of every result is flattened to 1-D and the
    pieces are joined in the order they were produced, so the returned clip
    covers all results instead of only a single one.

    Args:
        text: Text to synthesize.
        spk: Speaker name to synthesize with.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is
        ``cosyvoice.sample_rate`` and ``samples`` is a 1-D NumPy array.

    Raises:
        gr.Error: If ``text`` is blank or the model produced no audio.
    """
    # Reject blank input up front instead of handing it to the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Collect every yielded piece; keeping only one would truncate the output
    # whenever the model returns the speech in several parts.
    segments = [
        result["tts_speech"].numpy().flatten()
        for result in cosyvoice.inference_sft(text, spk, stream=False)
    ]

    # Without this guard an empty result stream would surface as an
    # unrelated-looking error further down.
    if not segments:
        raise gr.Error("No audio was generated for the given text.")

    return (cosyvoice.sample_rate, np.concatenate(segments))
|
|
# Input widgets, listed in the same order as the parameters of `tts`.
text_input = gr.Textbox(value="你好,我是通义生成式语音大模型。", label="Text")
speaker_input = gr.Dropdown(label="Speaker", choices=spk_list, value=spk_list[0])

# Output widget that receives the value returned by `tts`.
audio_output = gr.Audio(label="Audio")

demo = gr.Interface(
    fn=tts,
    inputs=[text_input, speaker_input],
    outputs=audio_output,
)

# Bind to all network interfaces on port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")