""" YingMusic Singer - Gradio Web Interface ======================================== 基于参考音色与旋律音频的歌声合成系统,支持自动分离人声与伴奏。 A singing voice synthesis system powered by YingMusicSinger, with built-in vocal/accompaniment separation via MelBandRoformer. """ try: import spaces except ImportError: spaces = None import os import tempfile import gradio as gr import torch import torchaudio from initialization import download_files IS_HF_SPACE = os.environ.get("SPACE_ID") is not None HF_ENABLE = False LOCAL_DEVICE = "cuda" if torch.cuda.is_available() else "cpu" def local_move2gpu(x): """Move models to GPU on local environment. No-op on HuggingFace Spaces (ZeroGPU handles it).""" if IS_HF_SPACE: return x return x.to(LOCAL_DEVICE) # --------------------------------------------------------------------------- # Model loading (lazy, singleton) / 模型懒加载(单例) # --------------------------------------------------------------------------- _model = None _separator = None def _load_model_impl(): """Internal: load YingMusicSinger (no GPU decorator, called inside GPU context).""" download_files(task="infer") global _model if _model is None: from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger _model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer") _model = local_move2gpu(_model) _model.eval() return _model def _load_separator_impl(): """Internal: load MelBandRoformer separator (no GPU decorator, called inside GPU context).""" download_files(task="infer") global _separator if _separator is None: from src.third_party.MusicSourceSeparationTraining.inference_api import Separator _separator = Separator( config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml", checkpoint_path="ckpts/MelBandRoformer.ckpt", ) return _separator # --------------------------------------------------------------------------- # Vocal separation utilities / 人声分离工具 # --------------------------------------------------------------------------- def _separate_vocals_impl(audio_path: str) -> tuple: """ Separate audio into vocals and accompaniment using MelBandRoformer. Must be called within an active GPU context. """ separator = _load_separator_impl() wav, sr = torchaudio.load(audio_path) vocal_wav, inst_wav, out_sr = separator.separate(wav, sr) tmp_dir = tempfile.mkdtemp() vocals_path = os.path.join(tmp_dir, "vocals.wav") accomp_path = os.path.join(tmp_dir, "accompaniment.wav") torchaudio.save(vocals_path, torch.from_numpy(vocal_wav), out_sr) torchaudio.save(accomp_path, torch.from_numpy(inst_wav), out_sr) return vocals_path, accomp_path def mix_vocal_and_accompaniment( vocal_path: str, accomp_path: str, vocal_gain: float = 1.0, ) -> str: """ 将合成人声与伴奏混合为最终音频。 Mix synthesised vocals with accompaniment into a final audio file. """ vocal_wav, vocal_sr = torchaudio.load(vocal_path) accomp_wav, accomp_sr = torchaudio.load(accomp_path) if accomp_sr != vocal_sr: accomp_wav = torchaudio.functional.resample(accomp_wav, accomp_sr, vocal_sr) if vocal_wav.shape[0] != accomp_wav.shape[0]: if vocal_wav.shape[0] == 1: vocal_wav = vocal_wav.expand(accomp_wav.shape[0], -1) else: accomp_wav = accomp_wav.expand(vocal_wav.shape[0], -1) min_len = min(vocal_wav.shape[1], accomp_wav.shape[1]) vocal_wav = vocal_wav[:, :min_len] accomp_wav = accomp_wav[:, :min_len] mixed = vocal_wav * vocal_gain + accomp_wav peak = mixed.abs().max() if peak > 1.0: mixed = mixed / peak out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav") torchaudio.save(out_path, mixed, sample_rate=vocal_sr) return out_path # --------------------------------------------------------------------------- # Inference wrapper / 推理入口 # Single @spaces.GPU scope covers ALL heavy work (separation + synthesis) # so models stay resident in GPU memory across steps within one call. # --------------------------------------------------------------------------- @spaces.GPU def synthesize( ref_audio, melody_audio, ref_text, target_text, separate_vocals_flag, mix_accompaniment_flag, sil_len_to_end, t_shift, nfe_step, cfg_strength, seed, ): """ 主合成流程 / Main synthesis pipeline. 1. (可选) 用 MelBandRoformer 分离参考音频和旋律音频的人声与伴奏 2. 送入 YingMusicSinger 合成 3. (可选) 将合成人声与旋律音频的伴奏混合 """ import random # ---- 输入校验 / Input validation ---------------------------------------- if ref_audio is None: raise gr.Error("请上传参考音频 / Please upload Reference Audio") if melody_audio is None: raise gr.Error("请上传旋律音频 / Please upload Melody Audio") if not ref_text.strip(): raise gr.Error("请输入参考音频对应的歌词 / Please enter Reference Text") if not target_text.strip(): raise gr.Error("请输入目标合成歌词 / Please enter Target Text") ref_audio_path = ref_audio if isinstance(ref_audio, str) else ref_audio[0] melody_audio_path = ( melody_audio if isinstance(melody_audio, str) else melody_audio[0] ) actual_seed = int(seed) if actual_seed < 0: actual_seed = random.randint(0, 2**31 - 1) # ---- Step 1: 人声分离(合并在同一 GPU 上下文中)/ Vocal separation (same GPU context) ---- melody_accomp_path = None actual_ref_path = ref_audio_path actual_melody_path = melody_audio_path if separate_vocals_flag: ref_vocals_path, _ = _separate_vocals_impl(ref_audio_path) actual_ref_path = ref_vocals_path melody_vocals_path, melody_accomp_path = _separate_vocals_impl(melody_audio_path) actual_melody_path = melody_vocals_path # ---- Step 2: 模型推理 / Model inference (same GPU context) --------------- model = _load_model_impl() audio_tensor, sr = model( ref_audio_path=actual_ref_path, melody_audio_path=actual_melody_path, ref_text=ref_text.strip(), target_text=target_text.strip(), lrc_align_mode="sentence_level", sil_len_to_end=float(sil_len_to_end), t_shift=float(t_shift), nfe_step=int(nfe_step), cfg_strength=float(cfg_strength), seed=actual_seed, ) vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav") torchaudio.save(vocal_out_path, audio_tensor.to("cpu"), sample_rate=sr) # ---- Step 3: 混合伴奏 / Mix accompaniment (optional) --------------------- if ( separate_vocals_flag and mix_accompaniment_flag and melody_accomp_path is not None ): final_path = mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path) return final_path else: return vocal_out_path # --------------------------------------------------------------------------- # Example presets / 预设示例 # --------------------------------------------------------------------------- EXAMPLES_MELODY_CONTROL = [ # [ref_audio, melody_audio, ref_text, target_text, sep, mix, sil, t_shift, nfe, cfg, seed] [ "examples/melody_control/ref_01.wav", "examples/melody_control/melody_01.wav", "该体谅的不执着|如果那天我", "好多天|看不完你", True, False, 0.5, 0.5, 32, 3.0, -1, ], [ "examples/melody_control/ref_02.wav", "examples/melody_control/melody_02.wav", "月光下的身影|渐渐模糊", "星光照亮前路|指引方向", True, False, 0.5, 0.5, 32, 3.0, -1, ], ] EXAMPLES_LYRIC_EDIT = [ [ "examples/lyric_edit/ref_01.wav", "examples/lyric_edit/melody_01.wav", "该体谅的不执着|如果那天我", "忘不掉的笑容|留在心里面", True, False, 0.5, 0.5, 32, 3.0, -1, ], [ "examples/lyric_edit/ref_02.wav", "examples/lyric_edit/melody_02.wav", "夜深了还不睡|想着你的脸", "春风又吹过来|带走我思念", True, False, 0.5, 0.5, 32, 3.0, -1, ], ] # --------------------------------------------------------------------------- # Custom CSS / 自定义样式 # --------------------------------------------------------------------------- CUSTOM_CSS = """ @import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,500;0,9..40,700;1,9..40,400&family=Playfair+Display:wght@600;800&display=swap'); :root { --primary: #e85d04; --primary-light: #f48c06; --bg-dark: #0d1117; --surface: #161b22; --surface-light: #21262d; --text: #f0f6fc; --text-muted: #8b949e; --accent-glow: rgba(232, 93, 4, 0.15); --border: #30363d; } .gradio-container { font-family: 'DM Sans', sans-serif !important; max-width: 1100px !important; margin: auto !important; } /* ---------- Badge links: no underline, no gap artifacts ---------- */ #app-header .badges a { text-decoration: none !important; display: inline-block; line-height: 0; margin: 3px 2px; } #app-header .badges a img, #app-header .badges > img { display: inline-block; vertical-align: middle; margin: 0; } #app-header .badges { line-height: 1.8; } /* ---------- Header / 头部 ---------- */ #app-header { text-align: center; padding: 1.8rem 1rem 0.5rem; } #app-header h1 { font-size: 1.45rem !important; font-weight: 700 !important; line-height: 1.4; margin-bottom: 0.6rem !important; } #app-header .badges img { display: inline-block; margin: 3px 2px; vertical-align: middle; } #app-header .authors { color: var(--text-muted); font-size: 0.92rem; margin: 0.5rem 0 0.2rem; line-height: 1.7; } #app-header .affiliations { color: var(--text-muted); font-size: 0.85rem; margin-bottom: 0.5rem; } #app-header .lang-links a { color: var(--primary-light); text-decoration: none; margin: 0 4px; font-size: 0.9rem; } #app-header .lang-links a:hover { text-decoration: underline; } /* ---------- Disclaimer ---------- */ #disclaimer { border-top: 1px solid var(--border); margin: 24px 0 4px; padding: 14px 4px 4px; font-size: 0.80rem; color: #6e7681; line-height: 1.65; } #disclaimer strong { color: #8b949e; font-weight: 600; } /* ---------- Section labels / 分区标题 ---------- */ .section-title { font-family: 'DM Sans', sans-serif !important; font-weight: 700 !important; font-size: 1rem !important; letter-spacing: 0.06em; text-transform: uppercase; color: var(--primary-light) !important; border-bottom: 2px solid var(--primary); padding-bottom: 6px; margin-bottom: 12px !important; } /* ---------- Example tabs ---------- */ .example-tab-label { font-weight: 600 !important; font-size: 0.95rem !important; } /* ---------- Run button / 合成按钮 ---------- */ #run-btn { background: linear-gradient(135deg, #e85d04, #dc2f02) !important; border: none !important; color: #fff !important; font-weight: 700 !important; font-size: 1.1rem !important; letter-spacing: 0.04em; padding: 12px 0 !important; border-radius: 10px !important; transition: transform 0.15s, box-shadow 0.25s !important; box-shadow: 0 4px 20px rgba(232, 93, 4, 0.35) !important; } #run-btn:hover { transform: translateY(-1px) !important; box-shadow: 0 6px 28px rgba(232, 93, 4, 0.5) !important; } /* ---------- Output audio / 输出音频 ---------- */ #output-audio { border: 2px solid var(--primary) !important; border-radius: 12px !important; background: var(--accent-glow) !important; } """ # --------------------------------------------------------------------------- # Header HTML / 头部 HTML # --------------------------------------------------------------------------- HEADER_HTML = """
1 Northwestern Polytechnical University · 2 Giant Network