"""Gradio demo for MOSS-TTS-Nano ONNX text-to-speech on CPU.

Importing this script has side effects: it downloads the runtime helper
modules it needs from the MOSS-TTS-Nano repository into the working
directory, writes a minimal ``moss_tts_nano`` stub package, and builds the
Gradio UI. Running it as a script additionally constructs the runtime and
starts the web server.
"""
import os
import time
import urllib.request

# Must be set before the numeric/ONNX libraries are imported further down.
os.environ["OMP_NUM_THREADS"] = "1"

BASE = "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main"


def _download(url, dest):
    """Download *url* to *dest* without ever leaving a truncated *dest*.

    The payload is written to ``<dest>.part`` and only renamed onto *dest*
    once the transfer finished, so an interrupted download is retried on the
    next start instead of being mistaken for a complete file.
    """
    partial = f"{dest}.part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, dest)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)


# Fetch the runtime modules that are imported below, skipping existing ones.
for fn in [
    "onnx_tts_runtime.py",
    "ort_cpu_runtime.py",
    "text_normalization_pipeline.py",
    "tts_robust_normalizer_single_script.py",
]:
    if not os.path.isfile(fn):
        _download(f"{BASE}/{fn}", fn)

# Minimal stand-in for the ``moss_tts_nano`` package.
# NOTE(review): presumably the downloaded runtime imports
# ``moss_tts_nano.defaults`` - confirm against onnx_tts_runtime.py.
os.makedirs("moss_tts_nano", exist_ok=True)
for sub, content in [
    ("__init__.py", '__version__ = "0.1.0"\n'),
    (
        "defaults.py",
        "from pathlib import Path\n"
        "REPO_ROOT = Path(__file__).resolve().parents[1]\n"
        "DEFAULT_OUTPUT_DIR = REPO_ROOT / 'generated_audio'\n",
    ),
]:
    if not os.path.isfile(f"moss_tts_nano/{sub}"):
        with open(f"moss_tts_nano/{sub}", "w", encoding="utf-8") as f:
            f.write(content)

# These imports intentionally come after the bootstrap above:
# ``onnx_tts_runtime`` may only exist on disk once it has been downloaded.
import gradio as gr
from onnx_tts_runtime import OnnxTtsRuntime

# Lazily created runtime shared by all requests (see get_rt).
RT = None
VOICES = [
    "Junhao", "Zhiming", "Weiguo", "Xiaoyu", "Yuewen", "Lingyu",
    "Trump", "Ava", "Bella", "Adam", "Nathan", "Soyo",
    "Saki", "Mortis", "Umiri", "Mei", "Anon", "Arisa",
]


def get_rt():
    """Return the shared ``OnnxTtsRuntime``, creating it on first use.

    The model directory comes from the ``MOSS_MODEL_DIR`` environment
    variable and falls back to ``"models"``.
    """
    global RT
    if RT is None:
        RT = OnnxTtsRuntime(
            os.environ.get("MOSS_MODEL_DIR", "models"),
            thread_count=1,
            execution_provider="cpu",
        )
    return RT


def synth(text, voice, ref, smode, mf, seed, at, ak, ap, ar):
    """Synthesize *text* and return ``(audio_path, status_message)``.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        voice: Built-in voice name; ignored when *ref* is given.
        ref: Optional path to a reference audio file. When set it is passed
            as ``prompt_audio_path`` and *voice* is sent as ``None``.
        smode: Sample mode forwarded to the runtime; ``"greedy"`` disables
            sampling, any other value enables it.
        mf: Maximum number of new frames (converted to ``int``).
        seed: Non-negative value reseeds the runtime RNG; a negative value
            or ``None`` leaves the RNG untouched.
        at: ``audio_temperature`` (converted to ``float``).
        ak: ``audio_top_k`` (converted to ``int``).
        ap: ``audio_top_p`` (converted to ``float``).
        ar: ``audio_repetition_penalty`` (converted to ``float``).

    Raises:
        gr.Error: If *text* is empty or whitespace only.
    """
    if not text or not text.strip():
        raise gr.Error("请输入要合成的文本。")

    rt = get_rt()
    max_frames = int(mf)

    # NOTE(review): this updates the shared runtime's manifest in place, so
    # the values persist into later requests - confirm that is intended.
    rt.manifest["generation_defaults"].update(
        max_new_frames=max_frames,
        sample_mode=smode,
        do_sample=smode != "greedy",
        audio_temperature=float(at),
        audio_top_k=int(ak),
        audio_top_p=float(ap),
        audio_repetition_penalty=float(ar),
    )

    if seed is not None and seed >= 0:
        import numpy as np

        rt.rng = np.random.default_rng(int(seed))

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments.
    started = time.perf_counter()
    result = rt.synthesize(
        text=text,
        voice=None if ref else voice,
        prompt_audio_path=ref or None,
        sample_mode=smode,
        streaming=True,
        max_new_frames=max_frames,
        enable_wetext=False,
        enable_normalize_tts_text=False,
    )
    elapsed = time.perf_counter() - started
    return (
        result["audio_path"],
        f"完成 — {elapsed:.1f}秒 | {result['sample_rate']}Hz",
    )


# NOTE(review): the original file had lost its indentation; the widget
# nesting below is reconstructed from statement order - confirm the layout.
with gr.Blocks(title="MOSS-TTS-Nano ONNX", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🗣️ MOSS-TTS-Nano-100M-ONNX\n纯 CPU 离线 TTS,支持语音克隆。首次运行自动下载 ~730MB 模型。")
    with gr.Row():
        with gr.Column(scale=1):
            text = gr.Textbox(label="合成文本", value="你好,欢迎使用 MOSS 语音合成。", lines=4)
            with gr.Row():
                voice = gr.Dropdown(choices=VOICES, value="Junhao", label="内置声音(上传参考音频后切换为语音克隆)")
                ref = gr.Audio(label="参考音频(可选)", type="filepath")
            with gr.Row():
                smode = gr.Dropdown(
                    choices=["greedy — 贪心解码", "fixed — 内置参数(推荐)", "full — 自定义采样"],
                    value="fixed — 内置参数(推荐)",
                    label="采样模式",
                )
                frames = gr.Slider(16, 750, value=375, step=1, label="最大帧数")
                seed = gr.Slider(-1, 9999, value=-1, step=1, label="随机种子(-1=随机)")
            btn = gr.Button("🎤 生成语音", variant="primary", size="lg")
            with gr.Accordion("⚙️ 高级采样参数(full 模式生效)", open=False):
                # value/step are passed by keyword, matching the sliders above.
                at = gr.Slider(0.1, 2.0, value=0.8, step=0.05, label="温度 audio_temperature")
                ak = gr.Slider(1, 100, value=25, step=1, label="Top-K audio_top_k")
                ap = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P audio_top_p")
                ar = gr.Slider(1.0, 2.0, value=1.2, step=0.05, label="重复惩罚 audio_repetition_penalty")
        with gr.Column(scale=1):
            audio_out = gr.Audio(label="生成结果", type="filepath")
            info = gr.Textbox(label="状态")
    # The dropdown value looks like "fixed — ..."; only the part before the
    # separator is handed to synth as the sample mode.
    btn.click(
        fn=lambda t, v, r, m, f, s, at, ak, ap, ar: synth(t, v, r, m.split(" — ")[0], f, s, at, ak, ap, ar),
        inputs=[text, voice, ref, smode, frames, seed, at, ak, ap, ar],
        outputs=[audio_out, info],
    )

if __name__ == "__main__":
    # Build the runtime before serving so the first request does not pay for it.
    get_rt()
    demo.launch(server_name="0.0.0.0", mcp_server=True)