| import os, time, urllib.request |
# Cap OpenMP at a single thread. This is set before the third-party imports
# further down the file; presumably so native libraries pick the limit up when
# they load — confirm.
os.environ["OMP_NUM_THREADS"] = "1"


# Raw-file base URL of the upstream MOSS-TTS-Nano repository (main branch).
BASE = "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main"

# Fetch every runtime module that is not already in the working directory.
for fn in [
    "onnx_tts_runtime.py",
    "ort_cpu_runtime.py",
    "text_normalization_pipeline.py",
    "tts_robust_normalizer_single_script.py",
]:
    if os.path.isfile(fn):
        continue
    # Download under a temporary name and rename only once the transfer has
    # finished. Writing straight to `fn` would leave a truncated file behind
    # after a failed or interrupted download, and the isfile() check above
    # would then skip it on every later run.
    tmp = f"{fn}.part"
    try:
        urllib.request.urlretrieve(f"{BASE}/{fn}", tmp)
        os.replace(tmp, fn)
    finally:
        # Nothing to do after a successful rename; otherwise drop the partial.
        if os.path.exists(tmp):
            os.remove(tmp)
|
|
# Create a minimal local `moss_tts_nano` package next to the downloaded
# modules. Files that already exist are left untouched.
os.makedirs("moss_tts_nano", exist_ok=True)
for stub_name, stub_source in {
    "__init__.py": '__version__ = "0.1.0"\n',
    "defaults.py": (
        "from pathlib import Path\n"
        "REPO_ROOT = Path(__file__).resolve().parents[1]\n"
        "DEFAULT_OUTPUT_DIR = REPO_ROOT / 'generated_audio'\n"
    ),
}.items():
    stub_path = f"moss_tts_nano/{stub_name}"
    if os.path.isfile(stub_path):
        continue
    with open(stub_path, "w") as handle:
        handle.write(stub_source)
|
|
| import gradio as gr |
| from onnx_tts_runtime import OnnxTtsRuntime |
|
|
# Shared runtime instance; stays None until get_rt() builds it on first use.
RT = None

# Built-in voice names offered by the voice dropdown.
VOICES = [
    "Junhao", "Zhiming", "Weiguo", "Xiaoyu", "Yuewen", "Lingyu",
    "Trump", "Ava", "Bella", "Adam", "Nathan", "Soyo",
    "Saki", "Mortis", "Umiri", "Mei", "Anon", "Arisa",
]
|
|
def get_rt():
    """Return the shared ``OnnxTtsRuntime``, constructing it on the first call.

    The model directory is read from the ``MOSS_MODEL_DIR`` environment
    variable and falls back to ``"models"``. The runtime is created with one
    thread and the CPU execution provider, then cached in the module-level
    ``RT`` so later calls reuse the same object.
    """
    global RT
    if RT is not None:
        return RT
    model_dir = os.environ.get("MOSS_MODEL_DIR", "models")
    RT = OnnxTtsRuntime(model_dir, thread_count=1, execution_provider="cpu")
    return RT
|
|
def synth(text, voice, ref, smode, mf, seed, at, ak, ap, ar):
    """Synthesize speech for *text* and return the audio path and a status line.

    Args:
        text: Text to synthesize.
        voice: Built-in voice name; not passed on when ``ref`` is given.
        ref: Path to a reference audio file, or a falsy value for none. When
            set it is passed as ``prompt_audio_path`` and ``voice`` becomes
            ``None``.
        smode: Sampling mode key; ``"greedy"`` switches ``do_sample`` off.
        mf: Maximum number of new frames (coerced to ``int``).
        seed: A non-negative value seeds the generator for this call. A
            negative value or ``None`` installs a fresh, unseeded generator.
        at: ``audio_temperature`` (coerced to ``float``).
        ak: ``audio_top_k`` (coerced to ``int``).
        ap: ``audio_top_p`` (coerced to ``float``).
        ar: ``audio_repetition_penalty`` (coerced to ``float``).

    Returns:
        A ``(audio_path, status)`` tuple: the ``"audio_path"`` entry of the
        runtime result, and a status string carrying the elapsed seconds and
        the result's ``"sample_rate"``.
    """
    import numpy as np

    rt = get_rt()

    # The generation defaults live on the shared runtime and are updated in
    # place; every key below is rewritten on each call, so nothing from a
    # previous request carries over.
    gd = rt.manifest["generation_defaults"]
    gd.update(
        max_new_frames=int(mf),
        sample_mode=smode,
        do_sample=smode != "greedy",
        audio_temperature=float(at),
        audio_top_k=int(ak),
        audio_top_p=float(ap),
        audio_repetition_penalty=float(ar),
    )

    # The runtime object is reused across requests, so the generator must be
    # replaced on every call. Previously it was only replaced for seed >= 0,
    # which left a seeded generator in place for all later "random" requests.
    # NOTE(review): assumes the runtime samples from `rt.rng`, as the original
    # seeded branch already did — confirm against onnx_tts_runtime.
    if seed is not None and seed >= 0:
        rt.rng = np.random.default_rng(int(seed))
    else:
        rt.rng = np.random.default_rng()

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    t0 = time.perf_counter()
    r = rt.synthesize(
        text=text,
        voice=None if ref else voice,
        prompt_audio_path=ref if ref else None,
        sample_mode=smode,
        streaming=True,
        max_new_frames=int(mf),
        enable_wetext=False,
        enable_normalize_tts_text=False,
    )
    elapsed = time.perf_counter() - t0
    return r["audio_path"], f"完成 — {elapsed:.1f}秒 | {r['sample_rate']}Hz"
|
|
# Gradio UI: one row with two equal-scale columns — inputs and sampling
# controls in the first, the generated audio and status text in the second.
# NOTE(review): SOURCE carried no indentation; the container nesting below is
# reconstructed — confirm it matches the intended layout.
with gr.Blocks(title="MOSS-TTS-Nano ONNX", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🗣️ MOSS-TTS-Nano-100M-ONNX\n纯 CPU 离线 TTS,支持语音克隆。首次运行自动下载 ~730MB 模型。")
    with gr.Row():
        with gr.Column(scale=1):
            text = gr.Textbox(label="合成文本", value="你好,欢迎使用 MOSS 语音合成。", lines=4)
            with gr.Row():
                voice = gr.Dropdown(choices=VOICES, value="Junhao", label="内置声音(上传参考音频后切换为语音克隆)")
                ref = gr.Audio(label="参考音频(可选)", type="filepath")
            with gr.Row():
                smode = gr.Dropdown(
                    choices=["greedy — 贪心解码", "fixed — 内置参数(推荐)", "full — 自定义采样"],
                    value="fixed — 内置参数(推荐)",
                    label="采样模式",
                )
                frames = gr.Slider(16, 750, value=375, step=1, label="最大帧数")
                seed = gr.Slider(-1, 9999, value=-1, step=1, label="随机种子(-1=随机)")
            btn = gr.Button("🎤 生成语音", variant="primary", size="lg")
            with gr.Accordion("⚙️ 高级采样参数(full 模式生效)", open=False):
                # `value` and `step` are passed by keyword, matching the
                # sliders above; a positional fourth argument is not accepted
                # where `step` is keyword-only in gr.Slider.
                at = gr.Slider(0.1, 2.0, value=0.8, step=0.05, label="温度 audio_temperature")
                ak = gr.Slider(1, 100, value=25, step=1, label="Top-K audio_top_k")
                ap = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P audio_top_p")
                ar = gr.Slider(1.0, 2.0, value=1.2, step=0.05, label="重复惩罚 audio_repetition_penalty")
        with gr.Column(scale=1):
            audio_out = gr.Audio(label="生成结果", type="filepath")
            info = gr.Textbox(label="状态")
    # The dropdown entries read "<key> — <description>"; only the key before
    # the " — " separator is handed to synth().
    btn.click(
        fn=lambda t, v, r, m, f, s, at, ak, ap, ar: synth(t, v, r, m.split(" — ")[0], f, s, at, ak, ap, ar),
        inputs=[text, voice, ref, smode, frames, seed, at, ak, ap, ar],
        outputs=[audio_out, info],
    )
|
|
if __name__ == "__main__":
    # Build the runtime up front so it is constructed before the server
    # starts, rather than during the first request.
    get_rt()
    # "0.0.0.0" makes the server listen on every network interface;
    # mcp_server=True turns on Gradio's MCP server alongside the web UI.
    launch_options = {"server_name": "0.0.0.0", "mcp_server": True}
    demo.launch(**launch_options)