"""
YingMusic Singer - Gradio Web Interface
========================================
基于参考音色与旋律音频的歌声合成系统,支持自动分离人声与伴奏。
A singing voice synthesis system powered by YingMusicSinger,
with built-in vocal/accompaniment separation via MelBandRoformer.
"""
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import numpy as np
# ---------------------------------------------------------------------------
# Model loading (lazy, singleton) / 模型懒加载(单例)
# ---------------------------------------------------------------------------
_model = None
_separator = None
def get_model(device: str = "cuda:0"):
"""加载 YingMusicSinger 模型 / Load YingMusicSinger model."""
global _model
if _model is None:
from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger
_model = YingMusicSinger(device=device)
return _model
def get_separator(device: str = "cuda:0"):
"""
加载 MelBandRoformer 分离模型 / Load MelBandRoformer separator.
Returns a Separator instance ready for inference.
"""
global _separator
if _separator is None:
from src.third_party.MusicSourceSeparationTraining.inference_api import (
Separator,
)
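        # The config and checkpoint paths below assume the files live under
        # ./ckpts relative to the working directory; adjust if stored elsewhere.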
_separator = Separator(
config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
checkpoint_path="ckpts/MelBandRoformer.ckpt",
device=device,
)
return _separator
# ---------------------------------------------------------------------------
# Vocal separation utilities / 人声分离工具
# ---------------------------------------------------------------------------
def separate_vocals(
audio_path: str,
device: str = "cuda:0",
) -> tuple:
"""
使用 MelBandRoformer 将音频分离为人声和伴奏。
Separate audio into vocals and accompaniment using MelBandRoformer.
Returns:
(vocals_path, accompaniment_path)
"""
separator = get_separator(device=device)
wav, sr = torchaudio.load(audio_path)
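    # Assumed Separator.separate() convention: it returns
    # (vocals, accompaniment, sample_rate) with both stems as numpy arrays
    # shaped (channels, samples); torchaudio.save below relies on that layout.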
vocal_wav, inst_wav, out_sr = separator.separate(wav, sr)
tmp_dir = tempfile.mkdtemp()
vocals_path = os.path.join(tmp_dir, "vocals.wav")
accomp_path = os.path.join(tmp_dir, "accompaniment.wav")
torchaudio.save(vocals_path, torch.from_numpy(vocal_wav), out_sr)
torchaudio.save(accomp_path, torch.from_numpy(inst_wav), out_sr)
return vocals_path, accomp_path
def mix_vocal_and_accompaniment(
vocal_path: str,
accomp_path: str,
vocal_gain: float = 1.0,
) -> str:
"""
将合成人声与伴奏混合为最终音频。
Mix synthesised vocals with accompaniment into a final audio file.
"""
vocal_wav, vocal_sr = torchaudio.load(vocal_path)
accomp_wav, accomp_sr = torchaudio.load(accomp_path)
# 统一采样率至人声采样率 / Resample accompaniment to match vocal sample rate
if accomp_sr != vocal_sr:
accomp_wav = torchaudio.functional.resample(accomp_wav, accomp_sr, vocal_sr)
# 统一声道数 / Match channel count
if vocal_wav.shape[0] != accomp_wav.shape[0]:
if vocal_wav.shape[0] == 1:
vocal_wav = vocal_wav.expand(accomp_wav.shape[0], -1)
else:
accomp_wav = accomp_wav.expand(vocal_wav.shape[0], -1)
# 对齐长度(以较短者为准)/ Align to shorter length
min_len = min(vocal_wav.shape[1], accomp_wav.shape[1])
vocal_wav = vocal_wav[:, :min_len]
accomp_wav = accomp_wav[:, :min_len]
mixed = vocal_wav * vocal_gain + accomp_wav
# 防止 clipping / Prevent clipping
peak = mixed.abs().max()
if peak > 1.0:
mixed = mixed / peak
out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav")
torchaudio.save(out_path, mixed, sample_rate=vocal_sr)
return out_path
# ---------------------------------------------------------------------------
# Inference wrapper / 推理入口
# ---------------------------------------------------------------------------
def synthesize(
ref_audio,
melody_audio,
ref_text,
target_text,
separate_vocals_flag,
mix_accompaniment_flag,
sil_len_to_end,
t_shift,
nfe_step,
cfg_strength,
seed,
):
"""
主合成流程 / Main synthesis pipeline.
1. (可选) 用 MelBandRoformer 分离参考音频和旋律音频的人声与伴奏
(Optional) Separate vocals & accompaniment from both ref and melody audio via MelBandRoformer
2. 送入 YingMusicSinger 合成
Run YingMusicSinger inference
3. (可选) 将合成人声与旋律音频的伴奏混合
(Optional) Mix synthesised vocals with melody accompaniment
"""
import random
# ---- 输入校验 / Input validation -----------------------------------------
if ref_audio is None:
raise gr.Error("请上传参考音频 / Please upload Reference Audio")
if melody_audio is None:
raise gr.Error("请上传旋律音频 / Please upload Melody Audio")
if not ref_text.strip():
raise gr.Error("请输入参考音频对应的歌词 / Please enter Reference Text")
if not target_text.strip():
raise gr.Error("请输入目标合成歌词 / Please enter Target Text")
ref_audio_path = ref_audio if isinstance(ref_audio, str) else ref_audio[0]
melody_audio_path = (
melody_audio if isinstance(melody_audio, str) else melody_audio[0]
)
# seed = -1 means random / seed 为 -1 时随机生成
actual_seed = int(seed)
if actual_seed < 0:
actual_seed = random.randint(0, 2**31 - 1)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# ---- Step 1: 人声分离(参考音频 + 旋律音频)/ Vocal separation for both (optional) ----
melody_accomp_path = None
actual_ref_path = ref_audio_path
actual_melody_path = melody_audio_path
if separate_vocals_flag:
# 分离参考音频 / Separate reference audio
ref_vocals_path, _ = separate_vocals(ref_audio_path, device=device)
actual_ref_path = ref_vocals_path
# 分离旋律音频 / Separate melody audio
melody_vocals_path, melody_accomp_path = separate_vocals(
melody_audio_path, device=device
)
actual_melody_path = melody_vocals_path
# ---- Step 2: 模型推理 / Model inference ----------------------------------
model = get_model(device=device)
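    # The model call returns (audio_tensor, sample_rate); the tensor is saved
    # unchanged as the raw vocal result below.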
audio_tensor, sr = model(
ref_audio_path=actual_ref_path,
melody_audio_path=actual_melody_path,
ref_text=ref_text.strip(),
target_text=target_text.strip(),
lrc_align_mode="sentence_level",
sil_len_to_end=float(sil_len_to_end),
t_shift=float(t_shift),
nfe_step=int(nfe_step),
cfg_strength=float(cfg_strength),
seed=actual_seed,
)
# 先保存纯人声合成结果 / Save raw vocal synthesis result
vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
torchaudio.save(vocal_out_path, audio_tensor, sample_rate=sr)
# ---- Step 3: 混合伴奏 / Mix accompaniment (optional) ---------------------
if (
separate_vocals_flag
and mix_accompaniment_flag
and melody_accomp_path is not None
):
final_path = mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path)
return final_path
else:
return vocal_out_path
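# Programmatic usage (a minimal sketch that bypasses the UI; the two .wav
# paths are placeholders and the numeric values mirror the UI defaults):
#
#   out_path = synthesize(
#       ref_audio="ref_singer.wav",        # placeholder: provides timbre
#       melody_audio="melody_song.wav",    # placeholder: provides melody & duration
#       ref_text="该体谅的不执着|如果那天我",
#       target_text="好多天|看不完你",
#       separate_vocals_flag=True,
#       mix_accompaniment_flag=True,
#       sil_len_to_end=0.5,
#       t_shift=0.5,
#       nfe_step=32,
#       cfg_strength=3.0,
#       seed=-1,                           # -1 picks a random seed
#   )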
# ---------------------------------------------------------------------------
# Custom CSS / 自定义样式
# ---------------------------------------------------------------------------
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,500;0,9..40,700;1,9..40,400&family=Playfair+Display:wght@600;800&display=swap');
:root {
--primary: #e85d04;
--primary-light: #f48c06;
--bg-dark: #0d1117;
--surface: #161b22;
--surface-light: #21262d;
--text: #f0f6fc;
--text-muted: #8b949e;
--accent-glow: rgba(232, 93, 4, 0.15);
--border: #30363d;
}
.gradio-container {
font-family: 'DM Sans', sans-serif !important;
max-width: 1100px !important;
margin: auto !important;
}
/* ---------- Header / 头部 ---------- */
#app-header {
text-align: center;
padding: 2.5rem 1rem 1.5rem;
}
#app-header h1 {
font-family: 'Playfair Display', serif !important;
font-size: 2.6rem !important;
font-weight: 800 !important;
background: linear-gradient(135deg, #f48c06, #e85d04, #dc2f02);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.3rem !important;
letter-spacing: -0.02em;
}
#app-header p {
color: var(--text-muted);
font-size: 1.05rem;
margin-top: 0;
}
/* ---------- Section labels / 分区标题 ---------- */
.section-title {
font-family: 'DM Sans', sans-serif !important;
font-weight: 700 !important;
font-size: 1rem !important;
letter-spacing: 0.06em;
text-transform: uppercase;
color: var(--primary-light) !important;
border-bottom: 2px solid var(--primary);
padding-bottom: 6px;
margin-bottom: 12px !important;
}
/* ---------- Run button / 合成按钮 ---------- */
#run-btn {
background: linear-gradient(135deg, #e85d04, #dc2f02) !important;
border: none !important;
color: #fff !important;
font-weight: 700 !important;
font-size: 1.1rem !important;
letter-spacing: 0.04em;
padding: 12px 0 !important;
border-radius: 10px !important;
transition: transform 0.15s, box-shadow 0.25s !important;
box-shadow: 0 4px 20px rgba(232, 93, 4, 0.35) !important;
}
#run-btn:hover {
transform: translateY(-1px) !important;
box-shadow: 0 6px 28px rgba(232, 93, 4, 0.5) !important;
}
/* ---------- Output audio / 输出音频 ---------- */
#output-audio {
border: 2px solid var(--primary) !important;
border-radius: 12px !important;
background: var(--accent-glow) !important;
}
"""
# ---------------------------------------------------------------------------
# Build the Gradio UI / 构建界面
# ---------------------------------------------------------------------------
def build_ui():
with gr.Blocks(
css=CUSTOM_CSS, title="YingMusic Singer", theme=gr.themes.Base()
) as demo:
# ---- Header / 头部 ----
gr.HTML(
"""
<div id="app-header">
<h1>♫ YingMusic Singer</h1>
<p>基于参考音色与旋律音频的歌声合成系统 &nbsp;·&nbsp; Singing Voice Synthesis</p>
</div>
"""
)
# ================================================================
# ROW 1 – 音频输入 / Audio Inputs + 歌词 / Lyrics (side by side)
# ================================================================
with gr.Row(equal_height=True):
# ---- 左栏:音频上传 / Left column: audio uploads ----
with gr.Column(scale=1):
gr.Markdown(
"#### 🎙️ 音频输入 / Audio Inputs", elem_classes="section-title"
)
ref_audio = gr.Audio(
label="参考音频 / Reference Audio(提供音色 / provides timbre)",
type="filepath",
)
melody_audio = gr.Audio(
label="旋律音频 / Melody Audio(提供旋律与时长 / provides melody & duration)",
type="filepath",
)
# ---- 右栏:歌词 / Right column: lyrics ----
with gr.Column(scale=1):
gr.Markdown("#### ✏️ 歌词 / Lyrics", elem_classes="section-title")
ref_text = gr.Textbox(
label="参考音频歌词 / Reference Text",
placeholder="例如 / e.g.:该体谅的不执着|如果那天我",
lines=5,
)
target_text = gr.Textbox(
label="目标合成歌词 / Target Text",
placeholder="例如 / e.g.:好多天|看不完你",
lines=5,
)
# ================================================================
# ROW 2 – 伴奏分离选项 / Vocal Separation Options
# ================================================================
gr.Markdown("#### 🎚️ 伴奏分离 / Vocal Separation", elem_classes="section-title")
with gr.Row():
separate_vocals_flag = gr.Checkbox(
value=True,
label="分离人声后过模型 / Separate vocals before synthesis",
info=(
"从参考音频和旋律音频中分离人声,仅用人声送入模型 / "
"Extract vocals from both reference and melody audio before feeding into the model"
),
)
mix_accompaniment_flag = gr.Checkbox(
value=False,
interactive=True,
label="输出时混入伴奏 / Mix accompaniment into output",
info=(
"将合成人声与分离出的伴奏混合输出(需先开启人声分离)/ "
"Mix synthesised vocals with separated accompaniment (requires vocal separation)"
),
)
# ================================================================
# ROW 3 – 高级参数 / Advanced Parameters (collapsible)
# ================================================================
with gr.Accordion("⚙️ 高级参数 / Advanced Parameters", open=False):
with gr.Row():
nfe_step = gr.Slider(
minimum=4,
maximum=128,
value=32,
step=1,
label="采样步数 / NFE Steps",
info="更多步数 = 更高质量,但更慢 / More steps = higher quality, but slower",
)
cfg_strength = gr.Slider(
minimum=0.0,
maximum=10.0,
value=3.0,
step=0.1,
label="CFG 强度 / CFG Strength",
info="Classifier‑Free Guidance 强度 / Classifier‑Free Guidance strength",
)
t_shift = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.5,
step=0.01,
label="采样时间偏移 / t‑shift",
)
with gr.Row():
sil_len_to_end = gr.Slider(
minimum=0.0,
maximum=3.0,
value=0.5,
step=0.1,
label="末尾静音时长 (秒) / Silence Padding (s)",
info="参考音频末尾追加的静音 / Silence appended after reference audio",
)
seed = gr.Number(
value=-1,
precision=0,
label="随机种子 / Random Seed",
info="-1 表示随机 / -1 means random",
)
# ================================================================
# ROW 4 – 合成按钮与输出 / Run & Output
# ================================================================
run_btn = gr.Button("🎤 开始合成 / Synthesize", elem_id="run-btn", size="lg")
output_audio = gr.Audio(
label="合成结果 / Generated Audio",
type="filepath",
elem_id="output-audio",
)
# ---- 联动:未开启分离时,禁用伴奏混合 ----
# ---- Disable mix checkbox when separation is off ----
separate_vocals_flag.change(
            fn=lambda sep: gr.update(interactive=sep, value=False),
inputs=[separate_vocals_flag],
outputs=[mix_accompaniment_flag],
)
# ---- 绑定事件 / Wire up ----
run_btn.click(
fn=synthesize,
inputs=[
ref_audio,
melody_audio,
ref_text,
target_text,
separate_vocals_flag,
mix_accompaniment_flag,
sil_len_to_end,
t_shift,
nfe_step,
cfg_strength,
seed,
],
outputs=output_audio,
)
# ---- 页脚 / Footer ----
gr.Markdown(
"""
---
<center style="color:#8b949e; font-size:0.85rem;">
YingMusic Singer &nbsp;·&nbsp; 基于 Flow Matching + VAE / Powered by Flow Matching + VAE &nbsp;·&nbsp;
用 <code>|</code> 分隔歌词中的乐句 / Use <code>|</code> to separate phrases in lyrics
</center>
""",
)
return demo
# ---------------------------------------------------------------------------
# Entry point / 启动入口
# ---------------------------------------------------------------------------
if __name__ == "__main__":
demo = build_ui()
demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)