# File size: 4,022 Bytes
# Revision hashes left by the file viewer: b270909 702fcc0 b270909 702fcc0 b270909 93ff498
import os, time, urllib.request
# Cap OpenMP-backed libraries at one thread. It is set ahead of the runtime
# import further down (presumably so the value is in place when the native
# libraries load — confirm before moving this line).
os.environ["OMP_NUM_THREADS"] = "1"

# Raw-file root of the upstream repository the runtime modules are fetched from.
BASE = "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main"

# Fetch every runtime module that is not already in the working directory.
for fn in [
    "onnx_tts_runtime.py",
    "ort_cpu_runtime.py",
    "text_normalization_pipeline.py",
    "tts_robust_normalizer_single_script.py",
]:
    if not os.path.isfile(fn):
        # Download under a sidecar name and rename only on success: an
        # interrupted transfer must not leave a truncated module behind that
        # the isfile() check above would accept on the next start.
        partial = f"{fn}.part"
        try:
            urllib.request.urlretrieve(f"{BASE}/{fn}", partial)
            os.replace(partial, fn)
        finally:
            # After a successful rename this is a no-op; after a failure it
            # removes the incomplete download.
            if os.path.exists(partial):
                os.remove(partial)
# Create a minimal local ``moss_tts_nano`` package (version marker plus a
# ``defaults`` module). Presumably the downloaded runtime modules import from
# it — verify against onnx_tts_runtime.py. Existing files are never overwritten.
os.makedirs("moss_tts_nano", exist_ok=True)
for sub, content in [
    ("__init__.py", '__version__ = "0.1.0"\n'),
    (
        "defaults.py",
        "from pathlib import Path\nREPO_ROOT = Path(__file__).resolve().parents[1]\nDEFAULT_OUTPUT_DIR = REPO_ROOT / 'generated_audio'\n",
    ),
]:
    if not os.path.isfile(f"moss_tts_nano/{sub}"):
        # Explicit encoding so the generated source does not depend on the
        # platform's default text encoding.
        with open(f"moss_tts_nano/{sub}", "w", encoding="utf-8") as f:
            f.write(content)
import gradio as gr
from onnx_tts_runtime import OnnxTtsRuntime
# Process-wide runtime handle; stays None until get_rt() constructs it.
RT = None

# Voice names offered by the built-in voice dropdown of the UI.
VOICES = [
    "Junhao",
    "Zhiming",
    "Weiguo",
    "Xiaoyu",
    "Yuewen",
    "Lingyu",
    "Trump",
    "Ava",
    "Bella",
    "Adam",
    "Nathan",
    "Soyo",
    "Saki",
    "Mortis",
    "Umiri",
    "Mei",
    "Anon",
    "Arisa",
]
def get_rt():
    """Return the shared ``OnnxTtsRuntime``, constructing it on first use.

    The model directory is read from the ``MOSS_MODEL_DIR`` environment
    variable and falls back to ``"models"``. The runtime is created with
    ``thread_count=1`` on the ``"cpu"`` execution provider and cached in the
    module-level ``RT`` so later calls reuse the same instance.
    """
    global RT
    if RT is not None:
        return RT
    model_dir = os.environ.get("MOSS_MODEL_DIR", "models")
    RT = OnnxTtsRuntime(model_dir, thread_count=1, execution_provider="cpu")
    return RT
def synth(text, voice, ref, smode, mf, seed, at, ak, ap, ar):
    """Synthesize ``text`` with the shared runtime and return the result.

    Args:
        text: Text to synthesize.
        voice: Built-in voice name; ignored when ``ref`` is given.
        ref: Path to a reference audio file, or a falsy value. When truthy it
            is passed as ``prompt_audio_path`` and ``voice`` is sent as None.
        smode: Sample mode string; ``do_sample`` is enabled for every mode
            other than ``"greedy"``.
        mf: Maximum number of new frames (converted to int).
        seed: RNG seed; a negative value or None leaves the runtime's RNG
            untouched.
        at: Value for ``audio_temperature``.
        ak: Value for ``audio_top_k``.
        ap: Value for ``audio_top_p``.
        ar: Value for ``audio_repetition_penalty``.

    Returns:
        A ``(audio_path, status_message)`` tuple, where the status message
        reports the elapsed seconds and the sample rate of the result.
    """
    rt = get_rt()

    # NOTE: this updates the runtime's "generation_defaults" mapping in
    # place, so the values stay on the shared runtime after this call.
    gd = rt.manifest["generation_defaults"]
    gd.update(
        max_new_frames=int(mf),
        sample_mode=smode,
        do_sample=smode != "greedy",
        audio_temperature=float(at),
        audio_top_k=int(ak),
        audio_top_p=float(ap),
        audio_repetition_penalty=float(ar),
    )

    # Guard against None so a caller that omits the seed does not trigger a
    # TypeError on the comparison.
    if seed is not None and seed >= 0:
        import numpy as np

        rt.rng = np.random.default_rng(int(seed))

    use_reference = bool(ref)
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments during synthesis.
    t0 = time.perf_counter()
    r = rt.synthesize(
        text=text,
        voice=None if use_reference else voice,
        prompt_audio_path=ref if use_reference else None,
        sample_mode=smode,
        streaming=True,
        max_new_frames=int(mf),
        enable_wetext=False,
        enable_normalize_tts_text=False,
    )
    elapsed = time.perf_counter() - t0
    return r["audio_path"], f"完成 — {elapsed:.1f}秒 | {r['sample_rate']}Hz"
# Gradio UI: inputs in the left column, generated audio and status on the right.
# NOTE(review): the original indentation was lost, so the Row/Column nesting
# below is a reconstruction — confirm the intended layout.
with gr.Blocks(title="MOSS-TTS-Nano ONNX", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🗣️ MOSS-TTS-Nano-100M-ONNX\n纯 CPU 离线 TTS,支持语音克隆。首次运行自动下载 ~730MB 模型。")
    with gr.Row():
        with gr.Column(scale=1):
            text = gr.Textbox(label="合成文本", value="你好,欢迎使用 MOSS 语音合成。", lines=4)
            with gr.Row():
                voice = gr.Dropdown(choices=VOICES, value="Junhao", label="内置声音(上传参考音频后切换为语音克隆)")
                ref = gr.Audio(label="参考音频(可选)", type="filepath")
            with gr.Row():
                # Each choice is "<mode key> — <description>"; the click
                # handler below keeps only the key before the separator.
                smode = gr.Dropdown(
                    choices=["greedy — 贪心解码", "fixed — 内置参数(推荐)", "full — 自定义采样"],
                    value="fixed — 内置参数(推荐)",
                    label="采样模式",
                )
                frames = gr.Slider(16, 750, value=375, step=1, label="最大帧数")
                seed = gr.Slider(-1, 9999, value=-1, step=1, label="随机种子(-1=随机)")
            btn = gr.Button("🎤 生成语音", variant="primary", size="lg")
            with gr.Accordion("⚙️ 高级采样参数(full 模式生效)", open=False):
                # value/step are passed by keyword: a positional ``step`` is
                # not accepted by Slider signatures where it is keyword-only.
                at = gr.Slider(0.1, 2.0, value=0.8, step=0.05, label="温度 audio_temperature")
                ak = gr.Slider(1, 100, value=25, step=1, label="Top-K audio_top_k")
                ap = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P audio_top_p")
                ar = gr.Slider(1.0, 2.0, value=1.2, step=0.05, label="重复惩罚 audio_repetition_penalty")
        with gr.Column(scale=1):
            audio_out = gr.Audio(label="生成结果", type="filepath")
            info = gr.Textbox(label="状态")
    # Strip the human-readable suffix from the sample-mode choice before
    # handing everything to synth().
    btn.click(
        fn=lambda t, v, r, m, f, s, at, ak, ap, ar: synth(t, v, r, m.split(" — ")[0], f, s, at, ak, ap, ar),
        inputs=[text, voice, ref, smode, frames, seed, at, ak, ap, ar],
        outputs=[audio_out, info],
    )
if __name__ == "__main__":
    # Construct the runtime before the server starts, so the first request
    # does not have to build it.
    get_rt()
    # Listen on all interfaces and enable Gradio's MCP server.
    demo.launch(server_name="0.0.0.0", mcp_server=True)