# Hugging Face Spaces deployment note: Running on Zero (ZeroGPU)
"""
YingMusic Singer - Gradio Web Interface
========================================

基于参考音色与旋律音频的歌声合成系统,支持自动分离人声与伴奏。

A singing voice synthesis system powered by YingMusicSinger,
with built-in vocal/accompaniment separation via MelBandRoformer.
"""
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import numpy as np
# ---------------------------------------------------------------------------
# Model loading (lazy, singleton)
# ---------------------------------------------------------------------------
_model = None
_separator = None


def get_model(device: str = "cuda:0"):
    """Return the process-wide YingMusicSinger instance, creating it lazily.

    NOTE: ``device`` only takes effect on the first call; subsequent calls
    return the cached singleton unchanged.
    """
    global _model
    if _model is not None:
        return _model
    # Import deferred so the heavy model stack is only pulled in when needed.
    from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger

    _model = YingMusicSinger(device=device)
    return _model
def get_separator(device: str = "cuda:0"):
    """Return the process-wide MelBandRoformer ``Separator``, creating it lazily.

    NOTE: ``device`` only takes effect on the first call; subsequent calls
    return the cached singleton unchanged.
    """
    global _separator
    if _separator is not None:
        return _separator
    # Import deferred so the separation stack is only pulled in when needed.
    from src.third_party.MusicSourceSeparationTraining.inference_api import (
        Separator,
    )

    _separator = Separator(
        config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
        checkpoint_path="ckpts/MelBandRoformer.ckpt",
        device=device,
    )
    return _separator
# ---------------------------------------------------------------------------
# Vocal separation utilities
# ---------------------------------------------------------------------------
def separate_vocals(
    audio_path: str,
    device: str = "cuda:0",
) -> tuple:
    """Split an audio file into vocals and accompaniment via MelBandRoformer.

    Returns:
        ``(vocals_path, accompaniment_path)`` — paths to WAV files written
        into a fresh temporary directory.
    """
    separator = get_separator(device=device)
    waveform, sample_rate = torchaudio.load(audio_path)
    # Separator.separate appears to return numpy arrays plus the output
    # sample rate (they are wrapped with torch.from_numpy below) — confirm
    # against the inference_api contract if it changes.
    vocals_np, accompaniment_np, out_sr = separator.separate(waveform, sample_rate)

    out_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(out_dir, "vocals.wav")
    accompaniment_path = os.path.join(out_dir, "accompaniment.wav")
    for path, data in (
        (vocals_path, vocals_np),
        (accompaniment_path, accompaniment_np),
    ):
        torchaudio.save(path, torch.from_numpy(data), out_sr)
    return vocals_path, accompaniment_path
def mix_vocal_and_accompaniment(
    vocal_path: str,
    accomp_path: str,
    vocal_gain: float = 1.0,
) -> str:
    """Mix synthesised vocals with accompaniment into a final audio file.

    The two tracks are resampled to a common rate, channel-matched,
    length-aligned (truncated to the shorter track), summed, and
    peak-normalised if the sum would clip.

    Args:
        vocal_path: Path to the synthesised vocal audio.
        accomp_path: Path to the accompaniment audio.
        vocal_gain: Linear gain applied to the vocal track before mixing.

    Returns:
        Path to the mixed WAV file, written into a fresh temp directory.
    """
    vocal_wav, vocal_sr = torchaudio.load(vocal_path)
    accomp_wav, accomp_sr = torchaudio.load(accomp_path)

    # Resample accompaniment to match the vocal sample rate.
    if accomp_sr != vocal_sr:
        accomp_wav = torchaudio.functional.resample(accomp_wav, accomp_sr, vocal_sr)

    # Match channel counts. ``Tensor.expand`` only broadcasts from a size-1
    # dim, so when neither track is mono we first downmix the wider track to
    # mono (the previous code raised RuntimeError in that case).
    if vocal_wav.shape[0] != accomp_wav.shape[0]:
        if vocal_wav.shape[0] == 1:
            vocal_wav = vocal_wav.expand(accomp_wav.shape[0], -1)
        elif accomp_wav.shape[0] == 1:
            accomp_wav = accomp_wav.expand(vocal_wav.shape[0], -1)
        elif vocal_wav.shape[0] > accomp_wav.shape[0]:
            vocal_wav = vocal_wav.mean(dim=0, keepdim=True).expand(
                accomp_wav.shape[0], -1
            )
        else:
            accomp_wav = accomp_wav.mean(dim=0, keepdim=True).expand(
                vocal_wav.shape[0], -1
            )

    # Align lengths to the shorter track.
    min_len = min(vocal_wav.shape[1], accomp_wav.shape[1])
    vocal_wav = vocal_wav[:, :min_len]
    accomp_wav = accomp_wav[:, :min_len]

    mixed = vocal_wav * vocal_gain + accomp_wav

    # Peak-normalise only when the mix would clip, so quiet mixes are untouched.
    peak = mixed.abs().max()
    if peak > 1.0:
        mixed = mixed / peak

    out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav")
    torchaudio.save(out_path, mixed, sample_rate=vocal_sr)
    return out_path
# ---------------------------------------------------------------------------
# Inference wrapper
# ---------------------------------------------------------------------------
def synthesize(
    ref_audio,
    melody_audio,
    ref_text,
    target_text,
    separate_vocals_flag,
    mix_accompaniment_flag,
    sil_len_to_end,
    t_shift,
    nfe_step,
    cfg_strength,
    seed,
):
    """Main synthesis pipeline.

    1. (Optional) Separate vocals & accompaniment from both the reference
       and melody audio via MelBandRoformer.
    2. Run YingMusicSinger inference on the (possibly separated) vocals.
    3. (Optional) Mix the synthesised vocals with the melody accompaniment.

    Returns:
        Path to the output WAV file (mixed, or vocals-only).

    Raises:
        gr.Error: On missing/empty user inputs.
    """
    import random

    # ---- Input validation ------------------------------------------------
    if ref_audio is None:
        raise gr.Error("请上传参考音频 / Please upload Reference Audio")
    if melody_audio is None:
        raise gr.Error("请上传旋律音频 / Please upload Melody Audio")
    # Guard against None as well as empty/whitespace-only text: the original
    # code called .strip() directly and crashed with AttributeError on None.
    if not (ref_text or "").strip():
        raise gr.Error("请输入参考音频对应的歌词 / Please enter Reference Text")
    if not (target_text or "").strip():
        raise gr.Error("请输入目标合成歌词 / Please enter Target Text")

    # gr.Audio(type="filepath") normally yields a str; tolerate tuple payloads.
    ref_audio_path = ref_audio if isinstance(ref_audio, str) else ref_audio[0]
    melody_audio_path = (
        melody_audio if isinstance(melody_audio, str) else melody_audio[0]
    )

    # seed = -1 means "pick a random seed".
    actual_seed = int(seed)
    if actual_seed < 0:
        actual_seed = random.randint(0, 2**31 - 1)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # ---- Step 1: vocal separation for both inputs (optional) -------------
    melody_accomp_path = None
    actual_ref_path = ref_audio_path
    actual_melody_path = melody_audio_path
    if separate_vocals_flag:
        # Reference audio: only the vocals are needed (timbre source).
        ref_vocals_path, _ = separate_vocals(ref_audio_path, device=device)
        actual_ref_path = ref_vocals_path
        # Melody audio: keep the accompaniment for optional remixing later.
        melody_vocals_path, melody_accomp_path = separate_vocals(
            melody_audio_path, device=device
        )
        actual_melody_path = melody_vocals_path

    # ---- Step 2: model inference -----------------------------------------
    model = get_model(device=device)
    audio_tensor, sr = model(
        ref_audio_path=actual_ref_path,
        melody_audio_path=actual_melody_path,
        ref_text=ref_text.strip(),
        target_text=target_text.strip(),
        lrc_align_mode="sentence_level",
        sil_len_to_end=float(sil_len_to_end),
        t_shift=float(t_shift),
        nfe_step=int(nfe_step),
        cfg_strength=float(cfg_strength),
        seed=actual_seed,
    )

    # Save the raw vocal-only synthesis result first.
    vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
    torchaudio.save(vocal_out_path, audio_tensor, sample_rate=sr)

    # ---- Step 3: mix accompaniment back in (optional) --------------------
    if (
        separate_vocals_flag
        and mix_accompaniment_flag
        and melody_accomp_path is not None
    ):
        final_path = mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path)
        return final_path
    else:
        return vocal_out_path
# ---------------------------------------------------------------------------
# Custom CSS injected into the Gradio Blocks app (see build_ui): dark theme,
# orange accent palette, and styling hooks for the elem_id/elem_classes used
# by the components below (#app-header, .section-title, #run-btn, #output-audio).
# ---------------------------------------------------------------------------
CUSTOM_CSS: str = """
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,500;0,9..40,700;1,9..40,400&family=Playfair+Display:wght@600;800&display=swap');
:root {
    --primary: #e85d04;
    --primary-light: #f48c06;
    --bg-dark: #0d1117;
    --surface: #161b22;
    --surface-light: #21262d;
    --text: #f0f6fc;
    --text-muted: #8b949e;
    --accent-glow: rgba(232, 93, 4, 0.15);
    --border: #30363d;
}
.gradio-container {
    font-family: 'DM Sans', sans-serif !important;
    max-width: 1100px !important;
    margin: auto !important;
}
/* ---------- Header ---------- */
#app-header {
    text-align: center;
    padding: 2.5rem 1rem 1.5rem;
}
#app-header h1 {
    font-family: 'Playfair Display', serif !important;
    font-size: 2.6rem !important;
    font-weight: 800 !important;
    background: linear-gradient(135deg, #f48c06, #e85d04, #dc2f02);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin-bottom: 0.3rem !important;
    letter-spacing: -0.02em;
}
#app-header p {
    color: var(--text-muted);
    font-size: 1.05rem;
    margin-top: 0;
}
/* ---------- Section labels ---------- */
.section-title {
    font-family: 'DM Sans', sans-serif !important;
    font-weight: 700 !important;
    font-size: 1rem !important;
    letter-spacing: 0.06em;
    text-transform: uppercase;
    color: var(--primary-light) !important;
    border-bottom: 2px solid var(--primary);
    padding-bottom: 6px;
    margin-bottom: 12px !important;
}
/* ---------- Run button ---------- */
#run-btn {
    background: linear-gradient(135deg, #e85d04, #dc2f02) !important;
    border: none !important;
    color: #fff !important;
    font-weight: 700 !important;
    font-size: 1.1rem !important;
    letter-spacing: 0.04em;
    padding: 12px 0 !important;
    border-radius: 10px !important;
    transition: transform 0.15s, box-shadow 0.25s !important;
    box-shadow: 0 4px 20px rgba(232, 93, 4, 0.35) !important;
}
#run-btn:hover {
    transform: translateY(-1px) !important;
    box-shadow: 0 6px 28px rgba(232, 93, 4, 0.5) !important;
}
/* ---------- Output audio ---------- */
#output-audio {
    border: 2px solid var(--primary) !important;
    border-radius: 12px !important;
    background: var(--accent-glow) !important;
}
"""
# ---------------------------------------------------------------------------
# Build the Gradio UI
# ---------------------------------------------------------------------------
def build_ui():
    """Assemble and return the Gradio Blocks app for YingMusic Singer."""
    with gr.Blocks(
        css=CUSTOM_CSS, title="YingMusic Singer", theme=gr.themes.Base()
    ) as demo:
        # ---- Header ----
        gr.HTML(
            """
            <div id="app-header">
                <h1>♫ YingMusic Singer</h1>
                <p>基于参考音色与旋律音频的歌声合成系统 · Singing Voice Synthesis</p>
            </div>
            """
        )

        # ================================================================
        # ROW 1 – Audio inputs + lyrics (side by side)
        # ================================================================
        with gr.Row(equal_height=True):
            # ---- Left column: audio uploads ----
            with gr.Column(scale=1):
                gr.Markdown(
                    "#### 🎙️ 音频输入 / Audio Inputs", elem_classes="section-title"
                )
                ref_audio = gr.Audio(
                    label="参考音频 / Reference Audio(提供音色 / provides timbre)",
                    type="filepath",
                )
                melody_audio = gr.Audio(
                    label="旋律音频 / Melody Audio(提供旋律与时长 / provides melody & duration)",
                    type="filepath",
                )
            # ---- Right column: lyrics ----
            with gr.Column(scale=1):
                gr.Markdown("#### ✏️ 歌词 / Lyrics", elem_classes="section-title")
                ref_text = gr.Textbox(
                    label="参考音频歌词 / Reference Text",
                    placeholder="例如 / e.g.:该体谅的不执着|如果那天我",
                    lines=5,
                )
                target_text = gr.Textbox(
                    label="目标合成歌词 / Target Text",
                    placeholder="例如 / e.g.:好多天|看不完你",
                    lines=5,
                )

        # ================================================================
        # ROW 2 – Vocal separation options
        # ================================================================
        gr.Markdown("#### 🎚️ 伴奏分离 / Vocal Separation", elem_classes="section-title")
        with gr.Row():
            separate_vocals_flag = gr.Checkbox(
                value=True,
                label="分离人声后过模型 / Separate vocals before synthesis",
                info=(
                    "从参考音频和旋律音频中分离人声,仅用人声送入模型 / "
                    "Extract vocals from both reference and melody audio before feeding into the model"
                ),
            )
            mix_accompaniment_flag = gr.Checkbox(
                value=False,
                interactive=True,
                label="输出时混入伴奏 / Mix accompaniment into output",
                info=(
                    "将合成人声与分离出的伴奏混合输出(需先开启人声分离)/ "
                    "Mix synthesised vocals with separated accompaniment (requires vocal separation)"
                ),
            )

        # ================================================================
        # ROW 3 – Advanced parameters (collapsible)
        # ================================================================
        with gr.Accordion("⚙️ 高级参数 / Advanced Parameters", open=False):
            with gr.Row():
                nfe_step = gr.Slider(
                    minimum=4,
                    maximum=128,
                    value=32,
                    step=1,
                    label="采样步数 / NFE Steps",
                    info="更多步数 = 更高质量,但更慢 / More steps = higher quality, but slower",
                )
                cfg_strength = gr.Slider(
                    minimum=0.0,
                    maximum=10.0,
                    value=3.0,
                    step=0.1,
                    label="CFG 强度 / CFG Strength",
                    info="Classifier‑Free Guidance 强度 / Classifier‑Free Guidance strength",
                )
                t_shift = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.01,
                    label="采样时间偏移 / t‑shift",
                )
            with gr.Row():
                sil_len_to_end = gr.Slider(
                    minimum=0.0,
                    maximum=3.0,
                    value=0.5,
                    step=0.1,
                    label="末尾静音时长 (秒) / Silence Padding (s)",
                    info="参考音频末尾追加的静音 / Silence appended after reference audio",
                )
                seed = gr.Number(
                    value=-1,
                    precision=0,
                    label="随机种子 / Random Seed",
                    info="-1 表示随机 / -1 means random",
                )

        # ================================================================
        # ROW 4 – Run button & output
        # ================================================================
        run_btn = gr.Button("🎤 开始合成 / Synthesize", elem_id="run-btn", size="lg")
        output_audio = gr.Audio(
            label="合成结果 / Generated Audio",
            type="filepath",
            elem_id="output-audio",
        )

        # ---- Disable mix checkbox when separation is off ----
        # The mix option only makes sense when separation is enabled. While
        # disabled, the checkbox is pinned to False, so unconditionally
        # sending value=False here is equivalent to (and replaces) the old
        # dead ternary `False if not sep else False`.
        separate_vocals_flag.change(
            fn=lambda sep: gr.update(interactive=sep, value=False),
            inputs=[separate_vocals_flag],
            outputs=[mix_accompaniment_flag],
        )

        # ---- Wire up the synthesis pipeline ----
        run_btn.click(
            fn=synthesize,
            inputs=[
                ref_audio,
                melody_audio,
                ref_text,
                target_text,
                separate_vocals_flag,
                mix_accompaniment_flag,
                sil_len_to_end,
                t_shift,
                nfe_step,
                cfg_strength,
                seed,
            ],
            outputs=output_audio,
        )

        # ---- Footer ----
        gr.Markdown(
            """
---
<center style="color:#8b949e; font-size:0.85rem;">
YingMusic Singer · 基于 Flow Matching + VAE / Powered by Flow Matching + VAE ·
用 <code>|</code> 分隔歌词中的乐句 / Use <code>|</code> to separate phrases in lyrics
</center>
            """,
        )
    return demo
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo = build_ui()
    # Enable request queueing so long-running synthesis jobs are serialized
    # instead of timing out concurrent HTTP requests.
    demo.queue()
    # Bind to all interfaces on port 7860 (the Hugging Face Spaces default);
    # no public Gradio share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)