import argparse
import gradio as gr
import json
import os
import sys
import hashlib
import time
import math
import re
import zipfile
from typing import Any, List, Dict, Optional, Tuple
from pathlib import Path
import numpy as np
from pydub import AudioSegment
from diffsinger_utau.voice_bank import PredAll
from diffsinger_utau.voice_bank.commons.ds_reader import DSReader
from diffsinger_utau.voice_bank.commons.phome_num_counter import Phome
from pypinyin import pinyin, Style
from pypinyin.constants import RE_HANS
# Text preprocessing: adjacent pure-Han tokens are joined without a space; every other boundary keeps its space.
def _is_hans_token(s: str) -> bool:
    """Return True when ``s`` is matched in full by ``RE_HANS``.

    Any exception raised while matching (for example a non-string argument)
    is treated as "not a Han token" and yields False.
    """
    try:
        if not RE_HANS:
            return False
        return RE_HANS.fullmatch(s) is not None
    except Exception:
        return False
def preprocess_zh_spaces(text: str) -> str:
    """Normalise spacing between the space-separated tokens of ``text``.

    Empty tokens (produced by repeated spaces) are dropped.  Two neighbouring
    tokens that are both pure-Han are joined directly; every other pair of
    neighbours is joined with exactly one space.  ``None`` or an empty string
    yields ``""``.
    """
    tokens = [tok for tok in (text or "").split(" ") if tok != ""]
    if not tokens:
        return ""
    pieces = [tokens[0]]
    for left, right in zip(tokens, tokens[1:]):
        # Only a Han/Han boundary loses its separating space.
        if not (_is_hans_token(left) and _is_hans_token(right)):
            pieces.append(" ")
        pieces.append(right)
    return "".join(pieces)
def validate_lyric_format(modified_text: str, original_text: str) -> Tuple[bool, str]:
    """Check that an edited lyric keeps the shape of the original lyric.

    The edit is accepted when, ignoring whitespace, it has the same length as
    the original and every ``AP``/``SP`` marker appears in the same order, with
    the same type and at the same whitespace-stripped position.

    Returns:
        ``(True, "")`` when the edit matches (or when there is no original
        text to compare against); otherwise ``(False, rendered)`` where
        ``rendered`` is the output of ``render_original_with_highlights``.
    """
    if not original_text:
        return True, ""

    def _without_ws(value: str) -> str:
        return re.sub(r'\s+', '', value)

    def _markers(value: str):
        # (raw index, marker) for each standalone AP/SP token.
        return [(m.start(), m.group()) for m in re.finditer(r'\b(AP|SP)\b', value)]

    def _mismatch() -> Tuple[bool, str]:
        return False, render_original_with_highlights(original_text, modified_text)

    # Overall length, whitespace excluded.
    if len(_without_ws(modified_text)) != len(_without_ws(original_text)):
        return _mismatch()

    edited_markers = _markers(modified_text)
    source_markers = _markers(original_text)
    if len(edited_markers) != len(source_markers):
        return _mismatch()

    for (edited_at, edited_kind), (source_at, source_kind) in zip(edited_markers, source_markers):
        if edited_kind != source_kind:
            return _mismatch()
        # Compare positions measured in the whitespace-stripped text.
        edited_rel = len(_without_ws(modified_text[:edited_at]))
        source_rel = len(_without_ws(original_text[:source_at]))
        if edited_rel != source_rel:
            return _mismatch()
    return True, ""
def render_original_with_highlights(original_text: str, modified_text: str) -> str:
    """Render the original lyric as HTML, flagging misplaced AP/SP markers.

    The whole original text is wrapped in a grey ``<span>``.  Each standalone
    ``AP``/``SP`` marker of the original whose (whitespace-stripped position,
    type) pair does not occur in ``modified_text`` is additionally wrapped in
    a red ``<span>``.  All other characters are HTML-escaped so lyric text
    cannot inject markup into the rendered message.

    Args:
        original_text: The template lyric to display.
        modified_text: The user's edited lyric, used only to locate markers.

    Returns:
        An HTML fragment suitable for a Markdown/HTML component.
    """
    import html  # local import so the file-level import block stays untouched

    original_text = original_text or ""
    modified_text = modified_text or ""

    # (position in whitespace-stripped text, marker) for every marker in the edit.
    modified_ap_sp = {
        (len(re.sub(r'\s+', '', modified_text[:m.start()])), m.group())
        for m in re.finditer(r'\b(AP|SP)\b', modified_text)
    }

    result_parts = []
    total = len(original_text)
    i = 0
    clean_pos = 0  # index into the whitespace-stripped original
    while i < total:
        token = original_text[i:i + 2]
        left_ok = i == 0 or not original_text[i - 1].isalnum()
        right_ok = i + 2 >= total or not original_text[i + 2].isalnum()
        if token in ('AP', 'SP') and left_ok and right_ok:
            if (clean_pos, token) in modified_ap_sp:
                result_parts.append(token)
            else:
                # Marker missing or moved in the edit: highlight it in red.
                result_parts.append(f'<span style="color: red;">{token}</span>')
            i += 2
            clean_pos += 2
            continue
        ch = original_text[i]
        result_parts.append(html.escape(ch))
        if not ch.isspace():
            clean_pos += 1
        i += 1
    return f'<span style="color: gray;">{"".join(result_parts)}</span>'
# Try to import diffsinger-utau (the project relies on this library rather
# than re-implementing it); the name is bound to None when the import fails.
try:
    import diffsinger_utau  # type: ignore
except Exception:
    diffsinger_utau = None

# All working directories hang off the folder that contains this file.
ROOT = Path(__file__).parent.resolve()
MODELS_DIR = ROOT.joinpath("models")
PUBLIC_TEMPLATES_DIR = ROOT.joinpath("templates", "public")
USER_TEMPLATES_DIR = ROOT.joinpath("templates", "user")
OUTPUT_DIR = ROOT.joinpath("output", "pred_all")
CACHE_DIR = ROOT.joinpath("cache")

# Audio extensions probed, in this order, when looking for a template's BGM.
AUDIO_EXTS = [".wav", ".mp3", ".flac", ".m4a", ".ogg"]

# Number of lyric textboxes created up front, so event handlers never have to
# create components dynamically.
MAX_LINES = 200
def ensure_dirs():
    """Create the model, template, output and cache directories if missing."""
    required = (MODELS_DIR, PUBLIC_TEMPLATES_DIR, USER_TEMPLATES_DIR, OUTPUT_DIR, CACHE_DIR)
    for directory in required:
        directory.mkdir(parents=True, exist_ok=True)
def list_model_choices() -> List[str]:
    """List selectable models as paths relative to ``ROOT``.

    Every directory and every regular file directly inside ``MODELS_DIR`` is
    offered (a model may be a folder or a single weights file), in sorted
    order.  Returns an empty list when ``MODELS_DIR`` does not exist.
    """
    if not MODELS_DIR.exists():
        return []
    return [
        str(entry.relative_to(ROOT))
        for entry in sorted(MODELS_DIR.iterdir())
        if entry.is_dir() or entry.is_file()
    ]
def find_templates() -> Dict[str, Path]:
    """Map template name (file stem) to the path of its ``.ds`` file.

    The public directory is scanned first and the user directory second, so a
    user template replaces a public template of the same name.
    """
    found: Dict[str, Path] = {}
    for folder in (PUBLIC_TEMPLATES_DIR, USER_TEMPLATES_DIR):
        if not folder.exists():
            continue
        found.update({ds_file.stem: ds_file for ds_file in folder.glob("*.ds")})
    return found
def bgm_path_for(template_path: Path) -> Optional[Path]:
    """Return the backing-track file that sits next to a template, if any.

    The template's suffix is replaced by each extension in ``AUDIO_EXTS`` in
    turn; the first existing file wins.  Returns None when none exists.
    """
    stem_path = str(template_path.with_suffix(""))
    candidates = (Path(stem_path + ext) for ext in AUDIO_EXTS)
    return next((candidate for candidate in candidates if candidate.exists()), None)
def load_ds(template_path: Path):
    """Read a ``.ds`` template through ``DSReader`` and return its sentences."""
    return DSReader(template_path).read_ds()
def audiosegment_from_file(path: Path) -> AudioSegment:
    """Load an audio file from ``path`` as a pydub ``AudioSegment``."""
    source = str(path)
    return AudioSegment.from_file(source)
def export_wav(seg: AudioSegment, path: Path):
    """Write ``seg`` to ``path`` as WAV, creating parent directories first."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    seg.export(str(path), format="wav")
def overlay_bgm_snippet(vocal_wav: Path, bgm_audio: AudioSegment, offset_sec: float, bgm_volume: float = 0.3, vocal_gain_db: float = 0.0) -> AudioSegment:
    """Mix one rendered vocal line over the matching stretch of the BGM.

    Args:
        vocal_wav: Path of the rendered vocal clip.
        bgm_audio: The full backing track.
        offset_sec: Start of the line on the song timeline, in seconds
            (negative values are clamped to 0).
        bgm_volume: Linear gain for the BGM; values <= 0 replace the BGM with
            silence.
        vocal_gain_db: Gain in dB added to the vocal when non-zero.

    Returns:
        The mixed audio restricted to the time span covered by the vocal.
    """
    vocal = audiosegment_from_file(vocal_wav)
    if vocal_gain_db != 0.0:
        vocal = vocal + vocal_gain_db

    start_ms = max(int(offset_sec * 1000), 0)
    end_ms = start_ms + len(vocal)

    # Apply the BGM volume multiplier (converted to dB).
    if bgm_volume <= 0.0:
        backing = AudioSegment.silent(duration=end_ms)
    else:
        backing = bgm_audio + 20.0 * math.log10(bgm_volume)

    # Pad the backing track so it reaches the end of the vocal.
    if len(backing) < end_ms:
        backing = backing + AudioSegment.silent(duration=end_ms - len(backing))

    combined = backing.overlay(vocal, position=start_ms)
    return combined[start_ms:end_ms]  # pyright: ignore[reportReturnType]
def concat_with_offsets(clips: List[Tuple[AudioSegment, float]]) -> AudioSegment:
    """Place clips on a silent timeline at their offsets and return the result.

    Each item is ``(segment, offset_seconds)``; negative offsets are clamped
    to 0.  The timeline is exactly long enough to hold the clip that ends
    last.  An empty input yields a zero-length silent segment.
    """
    if not clips:
        return AudioSegment.silent(duration=0)

    placed = [(int(max(offset, 0) * 1000), seg) for seg, offset in clips]
    total_ms = max(0, max(start + len(seg) for start, seg in placed))

    timeline = AudioSegment.silent(duration=total_ms)
    for start, seg in placed:
        timeline = timeline.overlay(seg, position=start)
    return timeline
def mix_full_song(vocal: AudioSegment, bgm: AudioSegment, bgm_volume: float = 0.3) -> AudioSegment:
    """Overlay the full vocal track on the backing track.

    The shorter of the two is padded with trailing silence so both have the
    same length.  ``bgm_volume`` is a linear multiplier; values <= 0 replace
    the BGM with silence of the same length.
    """
    # Equalise lengths by padding whichever track is shorter.
    length_gap = len(vocal) - len(bgm)
    if length_gap > 0:
        bgm = bgm + AudioSegment.silent(duration=length_gap)
    else:
        vocal = vocal + AudioSegment.silent(duration=-length_gap)

    # Apply the BGM volume multiplier (converted to dB).
    if bgm_volume <= 0.0:
        backing = AudioSegment.silent(duration=len(bgm))
    else:
        backing = bgm + 20.0 * math.log10(bgm_volume)
    return backing.overlay(vocal)
def generate_ai_prompt(
    lyrics_text: str,
    title: str = "历史的进程推着人前进",
    theme: str = "个人奋斗固然重要,但是历史进程更加浩浩汤汤",
) -> str:
    """Build the prompt a user pastes into an LLM to rewrite the lyrics.

    Args:
        lyrics_text: Current lyrics, one sentence per line.  Spacing is
            normalised with ``preprocess_zh_spaces``; an empty value produces
            an empty lyric block.
        title: Song title inserted into the request.  Defaults to the title
            that was previously hard-coded.
        theme: Song theme inserted into the request.  Defaults to the theme
            that was previously hard-coded.

    Returns:
        The prompt text (no trailing newline).
    """
    processed_lyrics = preprocess_zh_spaces(lyrics_text) if lyrics_text else ""
    # Assembled line by line so the emitted text does not depend on how this
    # source file happens to be indented.
    prompt_lines = [
        "这是原始歌词:",
        "```txt",
        processed_lyrics,
        "```",
        "其中SP和AP分别代表停顿和呼吸。你应该保留原始格式,然后按照要求替换歌词。",
        "保留原始格式的意思是每句歌词字数应该保持不变。",
        '比如 "AP 试着 SP 掬一把星辰 SP 在手心 SP" 修改为 "AP 天空 SP 赤色的晚霞 SP 刚散去 SP" 就是符合要求的。如果有多字、少字或者AP, SP位置不对,都是不符合要求的。',
        f"现在请帮我基于上述原始歌词模板,写一首歌曲《{title}》,主题为:{theme}。",
    ]
    return "\n".join(prompt_lines)
def apply_ai_lyrics(ai_lyrics: str, original_lyrics: str) -> Tuple[str, str]:
    """Validate AI-written lyrics against the original and return them.

    Both texts are split on newlines (blank lines are kept) and must have the
    same number of lines.

    Returns:
        ``(lyrics, message)``.  On success ``lyrics`` is the AI text with each
        line stripped; on failure it is ``original_lyrics`` unchanged and
        ``message`` explains why.
    """
    if not (ai_lyrics and ai_lyrics.strip()):
        return original_lyrics, "请先输入回填歌词"

    new_rows = list(map(str.strip, ai_lyrics.split('\n')))
    expected_count = len(original_lyrics.split('\n'))

    # The line counts must agree before anything is replaced.
    if len(new_rows) != expected_count:
        return original_lyrics, f"行数不匹配:AI歌词有{len(new_rows)}行,原始歌词有{expected_count}行"

    return '\n'.join(new_rows), "歌词应用成功!"
def param_hash(model_sel: str, speaker: str, key_shift: int, steps: int, text: str) -> str:
    """Return a 16-hex-character cache key for one set of render parameters.

    The parameters are serialised as compact, key-sorted JSON (falsy speaker
    and text become ``""``; key shift and steps are coerced to int) and the
    first 16 characters of the MD5 hex digest are returned.
    """
    payload = {
        "model": model_sel,
        "speaker": speaker or "",
        "key_shift": int(key_shift),
        "steps": int(steps),
        "text": text or "",
    }
    encoded = json.dumps(
        payload,
        ensure_ascii=False,
        separators=(",", ":"),
        sort_keys=True,
    ).encode("utf-8")
    digest = hashlib.md5(encoded).hexdigest()
    return digest[:16]
class DSUEngine:
    """Inference engine built on ``voice_bank.PredAll``.

    Predictors are created on first use and cached per resolved model path,
    so switching back to a model that was already used does not reload it.
    """

    def __init__(self):
        # True when both library entry points were importable.
        self.available = PredAll is not None and DSReader is not None
        # Resolved model path -> PredAll instance.
        self.predictors: Dict[str, PredAll] = {}

    def is_ready(self) -> bool:
        """Return whether the inference library is usable."""
        return self.available

    def _get_predictor(self, model_path: Path):
        """Return the cached predictor for ``model_path``, creating it if needed."""
        key = str(model_path.resolve())
        if key not in self.predictors:
            self.predictors[key] = PredAll(Path(key))
        return self.predictors[key]

    def synth_line(
        self,
        model_path: Path,
        template_path: Path,
        line_index: int,
        text: str,
        speaker: Optional[str],
        key_shift: int,
        steps: int,
        out_wav: Path,
    ) -> None:
        """Synthesise one template line with new lyrics and write it to ``out_wav``.

        Args:
            model_path: Model directory or file handed to ``PredAll``.
            template_path: ``.ds`` template containing the line.
            line_index: Zero-based index of the line inside the template.
            text: Replacement lyric for the line.
            speaker: Speaker name; when falsy, the predictor's first
                available speaker is used if it exposes any.
            key_shift: Key shift passed to the pipeline.
            steps: Acoustic sampling steps passed to the pipeline.
            out_wav: Destination path of the rendered audio.

        Raises:
            RuntimeError: If the engine is unavailable, the pipeline reports
                no audio path, or no non-empty output file exists afterwards.
            IndexError: If ``line_index`` is outside the template.
        """
        if not self.available:
            # Fail loudly: silently returning would leave callers pointing at
            # an output file that was never written.
            raise RuntimeError("推理引擎不可用:未能加载 diffsinger_utau")

        predictor = self._get_predictor(model_path)

        # Read the template and replace the target line's text (the library's
        # ``replace`` also refreshes the data derived from the text).
        ds_list = DSReader(template_path).read_ds()
        if not (0 <= line_index < len(ds_list)):
            raise IndexError("行索引越界")
        ds = ds_list[line_index]
        ds.replace(text)

        # Choose the speaker, falling back to the predictor's first one.
        spk = speaker
        if not spk:
            try:
                available = getattr(predictor, "available_speakers", None)
                if isinstance(available, (list, tuple)) and len(available) > 0:
                    spk = available[0]
            except Exception:
                # Best effort only: keep the caller's (empty) speaker value.
                pass

        out_wav.parent.mkdir(parents=True, exist_ok=True)
        results = predictor.predict_full_pipeline(
            ds=ds,
            lang="zh",
            speaker=spk,
            key_shift=int(key_shift),
            pitch_steps=10,
            variance_steps=10,
            acoustic_steps=int(steps),
            gender=0.0,
            output_dir=str(out_wav.parent),
            save_intermediate=False,
        )

        # Move the pipeline's output file to the requested name.
        audio_path = results.get("audio_path") if isinstance(results, dict) else None
        if not audio_path:
            raise RuntimeError("predict_full_pipeline 未返回 audio_path")
        src = Path(audio_path)
        if src.resolve() != out_wav.resolve() and src.exists():
            src.replace(out_wav)
        if not out_wav.exists() or out_wav.stat().st_size == 0:
            raise RuntimeError("未能生成音频文件")
# Module-level engine instance used by the render helpers and UI callbacks below.
engine = DSUEngine()
def get_template_choices_and_bgm_visible():
    """Return the sorted names of all templates currently on disk.

    Despite its name this returns only the names; whether the BGM control is
    shown is decided by the callers.
    """
    return sorted(find_templates())
def on_select_template(template_name: str):
    """Load a template and describe how the UI should show it.

    Returns:
        ``(bgm_update, lines, offsets)``: a ``gr.update`` for the BGM volume
        slider (visible with value 0.3 only when a BGM file exists), the
        preprocessed text of every sentence, and every sentence's offset as
        a float.  An unknown or empty name hides the slider and returns two
        empty lists.
    """
    templates = find_templates()
    if not template_name or template_name not in templates:
        return gr.update(visible=False, value=0.0), [], []

    template_path = templates[template_name]
    has_bgm = bgm_path_for(template_path) is not None
    sentences = load_ds(template_path)

    lines = [preprocess_zh_spaces(sentence.get("text", "")) for sentence in sentences]
    offsets = [float(sentence.get("offset", 0.0)) for sentence in sentences]
    slider_update = gr.update(visible=has_bgm, value=(0.3 if has_bgm else 0.0))
    return slider_update, lines, offsets
def render_single_line(
    model_sel: str,
    template_name: str,
    line_index: int,
    new_text: str,
    speaker: str,
    key_shift: int,
    steps: int,
    bgm_volume: float,
) -> Tuple[Optional[str], Optional[str]]:
    """Render one lyric line and return ``(audio_path, error_message)``.

    Exactly one element of the result is None.  Rendered vocals are cached
    under ``OUTPUT_DIR/<template>/<param hash>/line_<n>.wav`` and reused when
    the file already exists.  When the BGM volume is positive and the
    template has a BGM file, a preview mixed with the BGM is written next to
    the vocal and returned instead.  Any exception is converted into an error
    message rather than propagated.
    """
    try:
        if not model_sel:
            return None, "请先选择模型"

        templates = find_templates()
        if template_name not in templates:
            return None, "未找到模板"
        template_path = templates[template_name]

        sentences = load_ds(template_path)
        if line_index < 0 or line_index >= len(sentences):
            return None, "行索引越界"

        # The new text is used for this render only; the template file is
        # never written back.
        text = new_text.strip()
        if not text:
            return None, "文本为空"

        # Cache location derived from the render parameters.
        digest = param_hash(model_sel, speaker, key_shift, steps, text)
        cache_dir = OUTPUT_DIR / template_name / digest
        cache_dir.mkdir(parents=True, exist_ok=True)
        line_no = line_index + 1
        wav_out = cache_dir / f"line_{line_no}.wav"

        if not wav_out.exists():
            # Cache miss: run inference.
            engine.synth_line(
                model_path=ROOT / model_sel,
                template_path=template_path,
                line_index=line_index,
                text=text,
                speaker=speaker or None,
                key_shift=int(key_shift),
                steps=int(steps),
                out_wav=wav_out,
            )

        # Optional preview mixed with the backing track.
        bgm_file = bgm_path_for(template_path)
        wants_mix = bool(bgm_volume and bgm_volume > 0.0)
        if not (wants_mix and bgm_file and bgm_file.exists()):
            return str(wav_out), None

        backing = audiosegment_from_file(bgm_file)
        line_offset = float(sentences[line_index].get("offset", 0.0))
        preview = overlay_bgm_snippet(wav_out, backing, line_offset, bgm_volume)
        preview_wav = cache_dir / f"line_{line_no}_preview.wav"
        export_wav(preview, preview_wav)
        return str(preview_wav), None
    except Exception as exc:
        return None, f"{type(exc).__name__}: {exc}"
def generate_full_song(
    model_sel: str,
    template_name: str,
    lines: List[str],
    speaker: str,
    key_shift: int,
    steps: int,
    bgm_volume: float,
) -> Tuple[Optional[str], Optional[str]]:
    """Render every non-empty line and assemble the whole song.

    Returns:
        On success ``(vocal_path, mixed_path_or_None)``; the mixed file is
        produced only when the BGM volume is positive and the template has a
        BGM file.  On failure ``(None, error_message)``.
    """
    try:
        if not model_sel:
            return None, "请先选择模型"

        templates = find_templates()
        if template_name not in templates:
            return None, "未找到模板"
        template_path = templates[template_name]

        sentences = load_ds(template_path)
        if len(sentences) != len(lines):
            return None, "模板行数与编辑行数不一致"

        # Render each sentence, reusing cached audio where it exists.
        model_path = ROOT / model_sel
        placed_clips: List[Tuple[AudioSegment, float]] = []
        for idx, sentence in enumerate(sentences):
            text = (lines[idx] or "").strip()
            if not text:
                continue
            digest = param_hash(model_sel, speaker, key_shift, steps, text)
            cache_dir = OUTPUT_DIR / template_name / digest
            cache_dir.mkdir(parents=True, exist_ok=True)
            wav_out = cache_dir / f"line_{idx+1}.wav"
            if not wav_out.exists():
                engine.synth_line(
                    model_path=model_path,
                    template_path=template_path,
                    line_index=idx,
                    text=text,
                    speaker=speaker or None,
                    key_shift=int(key_shift),
                    steps=int(steps),
                    out_wav=wav_out,
                )
            clip = audiosegment_from_file(wav_out)
            placed_clips.append((clip, float(sentence.get("offset", 0.0))))

        # Lay the clips on the timeline to get the full vocal track.
        vocal_full = concat_with_offsets(placed_clips)
        final_dir = OUTPUT_DIR / template_name / "final"
        final_dir.mkdir(parents=True, exist_ok=True)
        ts_tag = time.strftime("%Y%m%d_%H%M%S")
        vocal_path = final_dir / f"{template_name}_vocal_{ts_tag}.wav"
        export_wav(vocal_full, vocal_path)

        # Mixed version, only when a BGM exists and its volume is positive.
        bgm_file = bgm_path_for(template_path)
        wants_mix = bool(bgm_volume and bgm_volume > 0.0)
        if not (wants_mix and bgm_file and bgm_file.exists()):
            return str(vocal_path), None

        mixed = mix_full_song(vocal_full, audiosegment_from_file(bgm_file), bgm_volume)
        mixed_path = final_dir / f"{template_name}_mixed_{ts_tag}.wav"
        export_wav(mixed, mixed_path)
        return str(vocal_path), str(mixed_path)
    except Exception as exc:
        return None, f"{type(exc).__name__}: {exc}"
def save_uploaded_template(file_obj) -> str:
    """Save an uploaded ``.ds`` template into ``templates/user``.

    The upload must have a ``.ds`` suffix and contain JSON whose top level is
    a list; it is re-serialised (UTF-8, 2-space indent) under its own file
    name.

    Returns:
        The template name (file stem) on success, or a string of the form
        ``"ERROR::<ExceptionType>: <message>"`` on any failure.
    """
    try:
        if file_obj is None:
            raise ValueError("未选择文件")

        upload_path = Path(file_obj.name)
        if upload_path.suffix.lower() != ".ds":
            raise ValueError("仅支持 .ds 模板文件")

        USER_TEMPLATES_DIR.mkdir(parents=True, exist_ok=True)
        target = USER_TEMPLATES_DIR / upload_path.name

        # Parse and sanity-check the upload before writing it out.
        payload = json.loads(upload_path.read_text(encoding="utf-8"))
        if not isinstance(payload, list):
            raise ValueError("ds 文件格式错误:顶层必须是 list")

        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
        target.write_text(serialized, encoding="utf-8")
        return upload_path.stem
    except Exception as exc:
        return f"ERROR::{type(exc).__name__}: {exc}"
def build_ui():
    """Build the DiffSinger WebUI and return the ``gr.Blocks`` app (not launched).

    Layout: a left column with model/template selection, upload/download,
    render parameters, previews, whole-song output and the AI-lyrics helper;
    a right column with ``MAX_LINES`` pre-created lyric textboxes, each with
    its own validation message.  All event handlers are defined and bound
    inside the Blocks context.

    NOTE(review): the source this was rebuilt from carried no indentation, so
    the exact nesting of layout containers (for example which controls share
    the parameter row) is reconstructed — confirm against the intended layout.
    """
    ensure_dirs()
    model_choices = list_model_choices()
    template_names = get_template_choices_and_bgm_visible()
    css = """
/* 全局:启用页面整体滚动,移除左右分栏独立滚动 */
#main-row { gap: 12px; }
#left-panel, #right-panel {
padding: 12px;
}
#left-panel { border-right: 1px solid #eee; }
/* 响应式:窄屏下上下布局,宽屏左右布局 */
@media (max-width: 900px) {
#main-row { flex-direction: column !important; }
#left-panel { border-right: none; border-bottom: 1px solid #eee; }
}
/* 紧凑按钮样式 */
.compact-btn button { padding: 4px 10px !important; min-height: 30px !important; height: 30px !important; }
.compact-row { gap: 8px !important; }
"""
    with gr.Blocks(title="DiffSinger WebUI", theme=gr.themes.Soft(), css=css, head='') as demo:
        with gr.Row(elem_id="main-row"):
            # Left column: controls and previews.
            with gr.Column(elem_id="left-panel", scale=1, min_width=360):
                # Title, model/template selection, upload/download.
                gr.Markdown("## DiffSinger WebUI")
                gr.Markdown("项目地址: [https://github.com/bingcheng1998/diffsinger-webui](https://github.com/bingcheng1998/diffsinger-webui)")
                model_sel = gr.Dropdown(choices=model_choices, label="模型选择", value=(model_choices[0] if model_choices else None))
                template_sel = gr.Dropdown(choices=template_names, label="模板选择", value=(template_names[0] if template_names else None))
                with gr.Row(elem_classes=["compact-row"]):
                    upload = gr.UploadButton("上传ds模板", file_types=[".ds"], elem_classes=["compact-btn"])
                    download_btn = gr.DownloadButton(label="下载ds&lrc", elem_classes=["compact-btn"])
                with gr.Row():
                    bgm_volume = gr.Slider(0.0, 2.0, value=0.3, step=0.01, label="BGM音量", visible=False)
                    key_shift = gr.Slider(-12, 12, value=0, step=1, label="音高偏移")
                    steps = gr.Slider(1, 50, value=4, step=1, label="渲染步数")
                    speaker = gr.Dropdown(label="演唱者", choices=[], value=None, interactive=True)
                # Single-line preview and its error message.
                per_line_audio = gr.Audio(label="单句预览", autoplay=True, interactive=False)
                per_line_error = gr.Markdown("", visible=False)
                # Whole-song generation controls and outputs.
                gen_btn = gr.Button("生成整首")
                progress_md = gr.Markdown("", visible=True)
                full_vocal = gr.Audio(label="整首(人声)", autoplay=False)
                full_mixed = gr.Audio(label="整首(混音)", autoplay=False, visible=False)
                # AI lyrics helper.
                with gr.Accordion("AI歌词", open=False):
                    gr.Markdown("点击按钮生成prompt并复制,然后到大模型APP粘贴,将结果回填到下方文本框")
                    ai_prompt_display = gr.Textbox(
                        label="AI歌词Prompt",
                        lines=8,
                        max_lines=12,
                        interactive=False,
                        placeholder="点击下方按钮生成prompt..."
                    )
                    with gr.Row():
                        generate_prompt_btn = gr.Button("生成Prompt", variant="primary")
                        copy_prompt_btn = gr.Button("📋 复制", variant="secondary")
                    ai_lyrics_input = gr.Textbox(
                        label="回填歌词",
                        lines=10,
                        max_lines=15,
                        placeholder="请将大模型生成的歌词粘贴到这里..."
                    )
                    apply_lyrics_btn = gr.Button("应用歌词", variant="primary")

            # Right column: lyric editing only.
            with gr.Column(elem_id="right-panel", scale=2):
                lines_state = gr.State([])
                offsets_state = gr.State([])
                # Whole-song generation state.
                stop_flag = gr.State(False)
                generating_flag = gr.State(False)
                dyn = gr.Column()
                # Textboxes and error messages are created up front; their
                # initial visibility/value follow the initially selected template.
                textboxes = []
                error_markdowns = []
                original_lines_state = gr.State([])  # original lyrics, used for validation
                init_lines = []
                init_offsets = []
                if template_sel.value:
                    try:
                        _, init_lines, init_offsets = on_select_template(template_sel.value)
                    except Exception:
                        init_lines, init_offsets = [], []
                with dyn:
                    for i in range(MAX_LINES):
                        visible = i < len(init_lines)
                        val = init_lines[i] if visible else ""
                        tb = gr.Textbox(value=val, label=f"第 {i+1} 句", lines=1, max_lines=1, visible=visible)
                        textboxes.append(tb)
                        # One validation message per textbox.
                        error_md = gr.Markdown("", visible=False)
                        error_markdowns.append(error_md)
                if template_sel.value:
                    lines_state.value = init_lines
                    offsets_state.value = init_offsets
                    original_lines_state.value = init_lines.copy()

        # ---- Template selection: refresh BGM slider, mixed-output visibility and textboxes ----
        def on_template_change(template_name):
            bgm_update, lines, offsets = on_select_template(template_name)
            has_bgm = bool(bgm_update.get("visible", False)) if isinstance(bgm_update, dict) else False
            tb_updates = []
            error_updates = []
            n = len(lines)
            for i, _tb in enumerate(textboxes):
                if i < n:
                    tb_updates.append(gr.update(value=lines[i], visible=True))
                else:
                    tb_updates.append(gr.update(value="", visible=False))
                error_updates.append(gr.update(value="", visible=False))
            # Order: BGM slider, template dropdown, lines state, offsets state,
            # cleared per-line error, mixed-output visibility, original-lines
            # state, then every textbox update and every error-message update.
            return (
                bgm_update,
                gr.update(choices=get_template_choices_and_bgm_visible(), value=template_name),
                lines,
                offsets,
                gr.update(value="", visible=False),
                gr.update(visible=has_bgm),
                lines.copy(),
                *tb_updates,
                *error_updates,
            )

        # ---- Model selection: refresh the speaker dropdown ----
        def on_model_change(model_path_rel):
            try:
                if not model_path_rel:
                    return gr.update(choices=[], value=None, interactive=False)
                model_path = ROOT / model_path_rel
                choices = []
                if getattr(engine, "available", False) and model_path.exists():
                    predictor = engine._get_predictor(model_path)
                    av = getattr(predictor, "available_speakers", None)
                    if isinstance(av, (list, tuple)):
                        choices = list(av)
                if choices:
                    return gr.update(choices=choices, value=choices[0], interactive=True)
                return gr.update(choices=[], value=None, interactive=False)
            except Exception:
                return gr.update(choices=[], value=None, interactive=False)

        model_sel.change(
            fn=on_model_change,
            inputs=[model_sel],
            outputs=[speaker],
        )

        # ---- BGM volume: the mixed output is shown only when the template has a BGM and volume > 0 ----
        def on_bgm_volume_change(bgm_vol, template_name):
            mapping = find_templates()
            has_bgm = False
            if template_name in mapping:
                has_bgm = bgm_path_for(mapping[template_name]) is not None
            return gr.update(visible=bool(bgm_vol and bgm_vol > 0 and has_bgm))

        bgm_volume.change(
            fn=on_bgm_volume_change,
            inputs=[bgm_volume, template_sel],
            outputs=[full_mixed],
        )

        template_sel.change(
            fn=on_template_change,
            inputs=[template_sel],
            outputs=[bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, original_lines_state, *textboxes, *error_markdowns],
        )

        # ---- Per-line submit: validate and render the single line ----
        for idx, tb in enumerate(textboxes):
            def make_submit(i):
                def _submit(new_text, lines_list, original_lines_list, model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v, bgm_volume_v):
                    # Ignore textboxes beyond the current template's line count.
                    if not isinstance(lines_list, list) or i >= max(len(lines_list), 0):
                        return gr.update(), gr.update(), gr.update(), lines_list
                    # Validate against the original lyric.
                    original_text = original_lines_list[i] if i < len(original_lines_list) else ""
                    is_valid, rendered_original = validate_lyric_format(new_text, original_text)
                    if not is_valid:
                        error_msg = f"字数与原始文本不符:{rendered_original}"
                        error_update = gr.update(value=error_msg, visible=True)
                    else:
                        error_update = gr.update(value="", visible=False)
                    # Render the audio (done even when validation failed).
                    audio_path, err = render_single_line(model_sel_v, template_sel_v, i, new_text, speaker_v, key_shift_v, steps_v, bgm_volume_v)
                    if i < len(lines_list):
                        lines_list[i] = new_text
                    if err:
                        return gr.update(value=None), gr.update(value=f"❌ {err}", visible=True), error_update, lines_list
                    return gr.update(value=audio_path), gr.update(value="", visible=False), error_update, lines_list
                return _submit

            tb.submit(
                fn=make_submit(idx),
                inputs=[tb, lines_state, original_lines_state, model_sel, template_sel, speaker, key_shift, steps, bgm_volume],
                outputs=[per_line_audio, per_line_error, error_markdowns[idx], lines_state],
            )

        # ---- Whole-song generation (progress reporting, stop request) ----
        def on_gen_or_stop(model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v, bgm_volume_v, lines, stop, generating, progress=gr.Progress(track_tqdm=True)):
            # While a generation is running, a click is treated as a stop request.
            if generating:
                stop = True
                # This handler is a generator, so outputs must be yielded; a
                # ``return <values>`` would be discarded and the click would
                # update nothing.
                yield gr.update(), gr.update(), gr.update(value="生成整首"), gr.update(value="已请求停止,稍候..."), stop, generating
                return
            # NOTE(review): ``stop`` below is the value received when this call
            # started; a later click that sets the stop state does not appear
            # to reach an already-running generator — verify, and consider
            # Gradio's ``cancels=`` mechanism instead.
            stop = False
            generating = True
            yield gr.update(value=None), gr.update(value=None), gr.update(value="停止生成整首"), gr.update(value="开始生成..."), stop, generating
            try:
                mapping = find_templates()
                if not model_sel_v:
                    raise gr.Error("请先选择模型")
                if template_sel_v not in mapping:
                    raise gr.Error("未找到模板")
                template_path = mapping[template_sel_v]
                ds = load_ds(template_path)
                if len(ds) != len(lines or []):
                    raise gr.Error("模板行数与编辑行数不一致")
                model_path = ROOT / model_sel_v
                segs_with_offsets = []
                total = len(ds)
                for idx, item in enumerate(ds):
                    if stop:
                        yield gr.update(), gr.update(), gr.update(value="生成整首"), gr.update(value=f"已中断,完成 {idx}/{total} 行。"), stop, False
                        return
                    text = (lines[idx] or "").strip()
                    if not text:
                        # Report progress but synthesise nothing for blank lines.
                        progress((idx + 1) / total, desc=f"跳过空白句 {idx+1}/{total}")
                        yield gr.update(), gr.update(), gr.update(value="停止生成整首"), gr.update(value=f"跳过空白句 {idx+1}/{total}"), stop, True
                        continue
                    progress((idx + 1) / total, desc=f"渲染第 {idx+1}/{total} 句")
                    yield gr.update(), gr.update(), gr.update(value="停止生成整首"), gr.update(value=f"渲染第 {idx+1}/{total} 句..."), stop, True
                    h = param_hash(model_sel_v, speaker_v, key_shift_v, steps_v, text)
                    cache_dir = OUTPUT_DIR / template_sel_v / h
                    cache_dir.mkdir(parents=True, exist_ok=True)
                    wav_out = cache_dir / f"line_{idx+1}.wav"
                    if not wav_out.exists():
                        engine.synth_line(
                            model_path=model_path,
                            template_path=template_path,
                            line_index=idx,
                            text=text,
                            speaker=speaker_v or None,
                            key_shift=int(key_shift_v),
                            steps=int(steps_v),
                            out_wav=wav_out,
                        )
                    segs_with_offsets.append((audiosegment_from_file(wav_out), float(item.get("offset", 0.0))))
                # Assemble and export the full vocal track.
                vocal_full = concat_with_offsets(segs_with_offsets)
                final_dir = OUTPUT_DIR / template_sel_v / "final"
                final_dir.mkdir(parents=True, exist_ok=True)
                ts_tag = time.strftime("%Y%m%d_%H%M%S")
                vocal_path = final_dir / f"{template_sel_v}_vocal_{ts_tag}.wav"
                export_wav(vocal_full, vocal_path)
                mixed_path = None
                bgm_p = bgm_path_for(template_path)
                if (bgm_volume_v and bgm_volume_v > 0.0) and bgm_p and bgm_p.exists():
                    mixed = mix_full_song(vocal_full, audiosegment_from_file(bgm_p), bgm_volume_v)
                    mixed_path = final_dir / f"{template_sel_v}_mixed_{ts_tag}.wav"
                    export_wav(mixed, mixed_path)
                # Done.
                yield gr.update(value=str(vocal_path)), gr.update(value=(str(mixed_path) if mixed_path else None)), gr.update(value="生成整首"), gr.update(value="已完成"), False, False
            except Exception as e:
                yield gr.update(value=None), gr.update(value=None), gr.update(value="生成整首"), gr.update(value=f"❌ 失败:{type(e).__name__}: {e}"), False, False

        gen_btn.click(
            fn=on_gen_or_stop,
            inputs=[model_sel, template_sel, speaker, key_shift, steps, bgm_volume, lines_state, stop_flag, generating_flag],
            outputs=[full_vocal, full_mixed, gen_btn, progress_md, stop_flag, generating_flag],
        )

        # ---- Download: edited .ds plus .lrc and .txt, zipped ----
        def clean_lyric_for_display(lyric_text):
            """Strip AP/SP markers and all whitespace from a lyric for display."""
            if not lyric_text:
                return ""
            cleaned = re.sub(r'\b(AP|SP)\b', '', lyric_text)
            cleaned = re.sub(r'\s+', '', cleaned)
            return cleaned

        def _lyric_for(i, sentence_data, lines):
            """Return the edited lyric for line ``i``, else the template's own text."""
            if i < len(lines or []) and lines[i]:
                return lines[i]
            return sentence_data.get("text", "")

        def generate_lrc_content(ds_data, lines, offsets):
            """Build the LRC file content, using each sentence offset as its timestamp."""
            lrc_lines = [
                "[ar:DiffSinger]",
                "[ti:Generated Song]",
                "[al:DiffSinger WebUI]",
                "[by:DiffSinger WebUI]",
                "",
            ]
            for i, sentence_data in enumerate(ds_data):
                display_text = clean_lyric_for_display(_lyric_for(i, sentence_data, lines))
                # Sentences without any real lyric are left out.
                if not display_text.strip():
                    continue
                start_time = 0.0
                if i < len(offsets or []) and offsets[i] is not None:
                    start_time = float(offsets[i])
                # Round to centiseconds BEFORE splitting into minutes/seconds so
                # 59.996 s becomes [01:00.00] rather than [00:60.00]; negative
                # offsets are clamped to zero.
                total_cs = max(int(round(start_time * 100)), 0)
                minutes, centis = divmod(total_cs, 6000)
                time_stamp = f"[{minutes:02d}:{centis // 100:02d}.{centis % 100:02d}]"
                lrc_lines.append(f"{time_stamp}{display_text}")
            return "\n".join(lrc_lines)

        def generate_txt_content(ds_data, lines):
            """Build the plain-text subtitle content (one displayed lyric per line)."""
            txt_lines = []
            for i, sentence_data in enumerate(ds_data):
                display_text = clean_lyric_for_display(_lyric_for(i, sentence_data, lines))
                if display_text.strip():
                    txt_lines.append(display_text)
            return "\n".join(txt_lines)

        def build_current_ds(template_sel_v, lines, offsets):
            mapping = find_templates()
            if not template_sel_v or template_sel_v not in mapping:
                raise gr.Error("未选择有效模板")
            tpl = mapping[template_sel_v]
            ds = load_ds(tpl)
            # Apply the edited text to each sentence through the library's
            # ``replace`` (the same call used before synthesis).
            for sentence, new_text in zip(ds, lines or []):
                sentence.replace(new_text)
            # Everything is written under output/pred_all/<template>/edits.
            ts_tag = time.strftime("%Y%m%d_%H%M%S")
            out_dir = OUTPUT_DIR / template_sel_v / "edits"
            out_dir.mkdir(parents=True, exist_ok=True)
            ds_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.ds"
            ds_path.write_text(json.dumps(ds, ensure_ascii=False, indent=2), encoding="utf-8")
            lrc_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.lrc"
            lrc_path.write_text(generate_lrc_content(ds, lines, offsets), encoding="utf-8")
            txt_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.txt"
            txt_path.write_text(generate_txt_content(ds, lines), encoding="utf-8")
            # Bundle the three files into one archive for download.
            zip_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.zip"
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                zipf.write(ds_path, ds_path.name)
                zipf.write(lrc_path, lrc_path.name)
                zipf.write(txt_path, txt_path.name)
            return str(zip_path)

        download_btn.click(
            fn=build_current_ds,
            inputs=[template_sel, lines_state, offsets_state],
            outputs=[download_btn],
        )

        # ---- Upload: validate + save, then switch the dropdown to the new template ----
        # This is the only upload binding.  A second handler that copied the
        # raw file without validation would let an invalid template land in
        # templates/user and compete for the same dropdown output.
        def on_upload(file_obj, current_template):
            name = save_uploaded_template(file_obj)
            if name.startswith("ERROR::"):
                return gr.update(choices=get_template_choices_and_bgm_visible(), value=current_template), gr.update(value=f"❌ {name[7:]}", visible=True)
            # Success: selecting the new template triggers on_template_change,
            # which refreshes the textboxes.
            return gr.update(choices=get_template_choices_and_bgm_visible(), value=name), gr.update(value="", visible=False)

        upload.upload(
            fn=on_upload,
            inputs=[upload, template_sel],
            outputs=[template_sel, per_line_error],
        )

        # ---- Page load: refresh speakers and the template/BGM/lyrics area ----
        def on_app_load(model_path_rel, template_name):
            spk_upd = on_model_change(model_path_rel)
            tpl_upds = on_template_change(template_name)
            return (spk_upd, *tpl_upds)

        # ---- AI lyrics helper ----
        def handle_generate_prompt(lines_list):
            """Join the current lines into one text and build the prompt from it."""
            lyrics_text = '\n'.join(lines_list) if lines_list else ""
            return generate_ai_prompt(lyrics_text)

        def handle_apply_lyrics(ai_lyrics, lines_list, original_lines_list):
            """Copy pasted AI lyrics into the textboxes, validating every line."""
            unchanged = [gr.update() for _ in textboxes] + [gr.update() for _ in error_markdowns] + [lines_list]
            if not ai_lyrics or not ai_lyrics.strip():
                gr.Warning("请先输入回填歌词")
                return unchanged
            # Blank lines are kept so the line count stays comparable.
            ai_lines = [line.strip() for line in ai_lyrics.split('\n')]
            if len(ai_lines) != len(lines_list):
                gr.Warning(f"行数不匹配:AI歌词有{len(ai_lines)}行,原始歌词有{len(lines_list)}行")
                return unchanged
            textbox_updates = []
            error_updates = []
            new_lines = ai_lines[:len(lines_list)]
            has_errors = False
            for i in range(len(textboxes)):
                if i >= len(new_lines):
                    textbox_updates.append(gr.update())
                    error_updates.append(gr.update(value="", visible=False))
                    continue
                textbox_updates.append(gr.update(value=new_lines[i]))
                if i >= len(original_lines_list):
                    error_updates.append(gr.update(value="", visible=False))
                    continue
                is_valid, rendered_original = validate_lyric_format(new_lines[i], original_lines_list[i])
                if is_valid:
                    error_updates.append(gr.update(value="", visible=False))
                else:
                    error_msg = f"字数与原始文本不符:{rendered_original}"
                    error_updates.append(gr.update(value=error_msg, visible=True))
                    has_errors = True
            # Keep one update per error component even if the lists ever differ in length.
            while len(error_updates) < len(error_markdowns):
                error_updates.append(gr.update(value="", visible=False))
            if has_errors:
                gr.Warning("歌词应用成功,但部分句子格式有误,请检查红色提示")
            else:
                gr.Info("歌词应用成功!所有句子格式正确")
            return textbox_updates + error_updates + [new_lines]

        generate_prompt_btn.click(
            fn=handle_generate_prompt,
            inputs=[lines_state],
            outputs=[ai_prompt_display]
        )
        # Copy to clipboard happens in the browser through the ``js`` hook.
        copy_prompt_btn.click(
            fn=lambda x: x,
            inputs=[ai_prompt_display],
            outputs=[],
            js="(text) => { navigator.clipboard.writeText(text); return text; }"
        )
        apply_lyrics_btn.click(
            fn=handle_apply_lyrics,
            inputs=[ai_lyrics_input, lines_state, original_lines_state],
            outputs=[*textboxes, *error_markdowns, lines_state]
        )
        demo.load(
            fn=on_app_load,
            inputs=[model_sel, template_sel],
            outputs=[speaker, bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, original_lines_state, *textboxes, *error_markdowns],
        )
    return demo
def main():
    """Build the UI and start the Gradio server with default launch settings."""
    build_ui().launch()


# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()