BrilliantCoolHuge commited on
Commit
3ed0779
·
verified ·
1 Parent(s): 4b27ebf

AI生成歌词功能

Browse files
Files changed (1) hide show
  1. app.py +387 -180
app.py CHANGED
@@ -6,49 +6,19 @@ import sys
6
  import hashlib
7
  import time
8
  import math
9
- from typing import List, Dict, Optional, Tuple
 
 
10
  from pathlib import Path
11
 
12
  import numpy as np
13
  from pydub import AudioSegment
14
- # 引入与 sample.py 一致的组件
15
- try:
16
- from diffsinger_utau.voice_bank import PredAll
17
- from diffsinger_utau.voice_bank.commons.ds_reader import DSReader
18
- from diffsinger_utau.voice_bank.commons.phome_num_counter import Phome
19
- from pypinyin import pinyin, Style
20
- from pypinyin.constants import RE_HANS
21
- except Exception:
22
- PredAll = None
23
- DSReader = None
24
- Phome = None
25
- Style = None
26
- RE_HANS = None
27
-
28
def get_opencpop_dict(path: str = str(Path("dictionaries") / "opencpop-extension.txt")) -> Dict[str, str]:
    """Load a tab-separated syllable -> phoneme dictionary.

    The mapping always contains the identity entries for ``AP`` and ``SP``.
    Lines without a tab are ignored; only the first tab on a line separates
    key from value, and both sides are stripped of surrounding whitespace.

    Args:
        path: Location of the dictionary file.

    Returns:
        The mapping; just the ``AP``/``SP`` entries when the file is missing.
    """
    mapping: Dict[str, str] = {"AP": "AP", "SP": "SP"}
    dict_file = Path(path)
    if dict_file.exists():
        with dict_file.open("r", encoding="utf-8") as handle:
            for raw in handle:
                key, tab, value = raw.partition("\t")
                if tab:
                    mapping[key.strip()] = value.strip()
    return mapping
39
-
40
def get_phonemes(text: str, opencpop_dict: Dict[str, str]) -> List[str]:
    """Convert lyric text into a flat list of phoneme tokens.

    Each pinyin syllable of ``text`` is looked up in ``opencpop_dict``
    (falling back to the syllable itself), and the looked-up values are then
    split on whitespace so multi-phoneme entries become separate tokens.

    Args:
        text: Lyric text to convert.
        opencpop_dict: Syllable -> phoneme-string mapping.

    Returns:
        The phoneme tokens in order.
    """
    if Style is None:
        # pypinyin is unavailable: degrade to a per-character lookup.
        return [opencpop_dict.get(char, char) for char in text]

    syllables = (entry[0].strip() for entry in pinyin(text, style=Style.NORMAL))
    mapped = [opencpop_dict.get(syllable, syllable) for syllable in syllables if syllable]
    return " ".join(mapped).split()
52
 
53
  # —— 文本预处理:相邻纯汉字不加空格,其余保留空格 ——
54
  def _is_hans_token(s: str) -> bool:
@@ -73,6 +43,91 @@ def preprocess_zh_spaces(text: str) -> str:
73
  out.append(" " + part)
74
  return "".join(out)
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # 试图导入 diffsinger-utau(按要求使用该库,而非自行实现)
77
  try:
78
  import diffsinger_utau # 类型: 忽略
@@ -135,28 +190,9 @@ def bgm_path_for(template_path: Path) -> Optional[Path]:
135
  return None
136
 
137
 
138
def load_ds(template_path: Path) -> List[Dict]:
    """Read a ds template and return its entries with normalised fields.

    The file must contain a JSON list of dicts. Every returned entry keeps
    all original keys and is guaranteed to have a string ``text`` and a float
    ``offset`` (seconds); an ``offset`` that is missing or cannot be converted
    becomes ``0.0``.

    Args:
        template_path: Path of the ds JSON file.

    Returns:
        One normalised dict per entry, in file order.

    Raises:
        ValueError: If the top level is not a list, an entry is not a dict,
            or an entry's ``text`` is not a string.
    """
    with open(template_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("ds 模板需要是一个 list")

    norm = []
    for position, item in enumerate(data, start=1):
        if not isinstance(item, dict):
            raise ValueError(f"ds 第 {position} 个元素不是 dict")
        text = item.get("text", "")
        if not isinstance(text, str):
            raise ValueError(f"ds 第 {position} 个元素的 text 不是字符串")
        try:
            offset = float(item.get("offset", 0.0))
        except (TypeError, ValueError, OverflowError):
            offset = 0.0
        # Spread the raw item first so the normalised values win; the previous
        # order let the raw "offset" overwrite the coerced float.
        norm.append({**item, "text": text, "offset": offset})
    return norm
159
-
160
 
161
  def audiosegment_from_file(path: Path) -> AudioSegment:
162
  return AudioSegment.from_file(str(path))
@@ -167,7 +203,7 @@ def export_wav(seg: AudioSegment, path: Path):
167
  seg.export(str(path), format="wav")
168
 
169
 
170
- def overlay_bgm_snippet(vocal_wav: Path, bgm_audio: AudioSegment, offset_sec: float, bgm_volume: float = 1.0, vocal_gain_db: float = 0.0) -> AudioSegment:
171
  vocal = audiosegment_from_file(vocal_wav)
172
  if vocal_gain_db != 0.0:
173
  vocal = vocal + vocal_gain_db
@@ -184,7 +220,8 @@ def overlay_bgm_snippet(vocal_wav: Path, bgm_audio: AudioSegment, offset_sec: fl
184
  pad_ms = start_ms + len(vocal) - len(base)
185
  base = base + AudioSegment.silent(duration=pad_ms)
186
  mixed = base.overlay(vocal, position=start_ms)
187
- return mixed[start_ms : start_ms + len(vocal)]
 
188
 
189
 
190
  def concat_with_offsets(clips: List[Tuple[AudioSegment, float]]) -> AudioSegment:
@@ -202,7 +239,7 @@ def concat_with_offsets(clips: List[Tuple[AudioSegment, float]]) -> AudioSegment
202
  return timeline
203
 
204
 
205
- def mix_full_song(vocal: AudioSegment, bgm: AudioSegment, bgm_volume: float = 1.0) -> AudioSegment:
206
  # 保证两者同长度
207
  if len(bgm) < len(vocal):
208
  bgm = bgm + AudioSegment.silent(duration=(len(vocal) - len(bgm)))
@@ -217,6 +254,62 @@ def mix_full_song(vocal: AudioSegment, bgm: AudioSegment, bgm_volume: float = 1.
217
  return bgm_adj.overlay(vocal)
218
 
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  def param_hash(model_sel: str, speaker: str, key_shift: int, steps: int, text: str) -> str:
221
  s = json.dumps(
222
  {
@@ -232,94 +325,16 @@ def param_hash(model_sel: str, speaker: str, key_shift: int, steps: int, text: s
232
  )
233
  return hashlib.md5(s.encode("utf-8")).hexdigest()[:16]
234
 
235
-
236
class DiffSingerEngine:
    """Thin adapter that calls whichever synthesis entry point diffsinger_utau exposes."""

    def __init__(self):
        """Look for the first callable entry point on the imported module."""
        self.impl = diffsinger_utau
        self.entry = None
        if self.impl is None:
            return
        # Entry-point names differ between library versions; take the first hit.
        probes = (
            getattr(self.impl, attr, None)
            for attr in ("synthesize", "synth", "infer", "generate", "tts")
        )
        self.entry = next((fn for fn in probes if callable(fn)), None)

    def is_ready(self) -> bool:
        """Return True when both the module and an entry point are available."""
        return not (self.impl is None or self.entry is None)

    def synth_once(
        self,
        model_path: Path,
        text: str,
        speaker: Optional[str],
        key_shift: int,
        steps: int,
        out_wav: Path,
    ) -> None:
        """Render one sentence to ``out_wav`` using the discovered entry point.

        Several keyword-argument layouts are attempted in turn. An attempt
        counts as successful once ``out_wav`` exists and is non-empty; if the
        call returns a ``(waveform, sample_rate, ...)`` sequence it is written
        to ``out_wav`` first.

        Raises:
            RuntimeError: If no entry point is available, or every layout failed.
        """
        if not self.is_ready():
            raise RuntimeError(
                "未找到可用的 diffsinger-utau 推理入口。请确认已安装并与 torch==1.13.1 兼容。"
            )

        out_wav.parent.mkdir(parents=True, exist_ok=True)

        model = str(model_path)
        target = str(out_wav)
        variants = [
            {"model": model, "text": text, "speaker": speaker, "key_shift": key_shift, "steps": steps, "out": target},
            {"model": model, "text": text, "speaker": speaker, "key_shift": key_shift, "acoustic_steps": steps, "out": target},
            {"model_path": model, "text": text, "speaker": speaker, "key_shift": key_shift, "steps": steps, "output": target},
            {"model": model, "text": text, "key_shift": key_shift, "steps": steps, "out": target},
            {"model": model, "text": text, "out": target},
        ]

        attempts = []
        failures = []
        for variant in variants:
            # None-valued arguments are omitted rather than passed through.
            call_args = {name: value for name, value in variant.items() if value is not None}
            try:
                attempts.append({"fn": self.entry.__name__, "kwargs": list(call_args)})
                outcome = self.entry(**call_args)
                # A returned (waveform, sample_rate) pair is written to disk here.
                if isinstance(outcome, (tuple, list)) and len(outcome) >= 2:
                    import soundfile as sf  # lazy import

                    samples, rate = outcome[0], outcome[1]
                    sf.write(target, np.asarray(samples, dtype=np.float32), int(rate))
                if out_wav.exists() and out_wav.stat().st_size > 0:
                    return
            except Exception as exc:
                failures.append(f"{exc}")

        last_error = failures[-1] if failures else '未知'
        raise RuntimeError(
            "调用 diffsinger-utau 失败。已尝试多种签名:"
            + json.dumps(attempts, ensure_ascii=False)
            + f";错误示例:{last_error}"
        )
307
-
308
-
309
- engine = DiffSingerEngine()
310
-
311
  class DSUEngine:
312
  """
313
- 基于 voice_bank.PredAll 的推理引擎;若不可用则回退到 DiffSingerEngine。
314
  """
315
- def __init__(self, old_engine: DiffSingerEngine):
316
- self.old = old_engine
317
  self.available = PredAll is not None and DSReader is not None
318
- self.predictors: Dict[str, object] = {} # model_path -> PredAll 实例
319
- self.opencpop = get_opencpop_dict()
320
 
321
  def is_ready(self) -> bool:
322
- return self.available or self.old.is_ready()
323
 
324
  def _get_predictor(self, model_path: Path):
325
  key = str(model_path.resolve())
@@ -341,17 +356,11 @@ class DSUEngine:
341
  if self.available:
342
  predictor = self._get_predictor(model_path)
343
  # 读取 ds,并替换目标行文本与必要的音素
344
- ds_list = DSReader(str(template_path)).read_ds()
345
  if not (0 <= line_index < len(ds_list)):
346
  raise IndexError("行索引越界")
347
  ds = ds_list[line_index]
348
- old_text = ds.get("text", "")
349
- ds["text"] = text
350
- # 若文本变化或缺少音素信息,则基于新文本重算音素
351
- if (text != old_text) or (not ds.get("ph_seq")) or (not ds.get("ph_num")):
352
- phonemes = get_phonemes(text, self.opencpop)
353
- ds["ph_seq"] = " ".join(phonemes)
354
- ds["ph_num"] = " ".join(map(str, Phome(phonemes).get_ph_num())) if Phome else ""
355
 
356
  # 选择说话人
357
  spk = speaker
@@ -387,11 +396,9 @@ class DSUEngine:
387
  if not out_wav.exists() or out_wav.stat().st_size == 0:
388
  raise RuntimeError("未能生成音频文件")
389
  else:
390
- # 回退旧引擎(不依赖 ds)
391
- self.old.synth_once(model_path, text, speaker, key_shift, steps, out_wav)
392
 
393
- # DSUEngine 覆盖默认引擎
394
- engine = DSUEngine(engine)
395
 
396
 
397
  def get_template_choices_and_bgm_visible():
@@ -410,7 +417,7 @@ def on_select_template(template_name: str):
410
  ds = load_ds(p)
411
  lines = [preprocess_zh_spaces(item.get("text", "")) for item in ds]
412
  offsets = [float(item.get("offset", 0.0)) for item in ds]
413
- bgm_update = gr.update(visible=(bgm is not None), value=(1.0 if bgm is not None else 0.0))
414
  return bgm_update, lines, offsets
415
 
416
 
@@ -605,10 +612,10 @@ def build_ui():
605
  template_sel = gr.Dropdown(choices=template_names, label="模板选择", value=(template_names[0] if template_names else None))
606
  with gr.Row(elem_classes=["compact-row"]):
607
  upload = gr.UploadButton("上传ds模板", file_types=[".ds"], elem_classes=["compact-btn"])
608
- download_btn = gr.DownloadButton(label="下载当前ds状态", elem_classes=["compact-btn"])
609
 
610
  with gr.Row():
611
- bgm_volume = gr.Slider(0.0, 2.0, value=1.0, step=0.01, label="BGM音量倍率", visible=False)
612
  key_shift = gr.Slider(-12, 12, value=0, step=1, label="音高偏移")
613
  steps = gr.Slider(1, 50, value=4, step=1, label="渲染步数")
614
  speaker = gr.Dropdown(label="演唱者", choices=[], value=None, interactive=True)
@@ -623,6 +630,18 @@ def build_ui():
623
  full_vocal = gr.Audio(label="整首(人声)", autoplay=False)
624
  full_mixed = gr.Audio(label="整首(混音)", autoplay=False, visible=False)
625
 
 
 
 
 
 
 
 
 
 
 
 
 
626
  # 右栏:模板与歌词编辑
627
  with gr.Column(elem_id="right-panel", scale=2):
628
  # 状态与歌词编辑容器(右栏仅歌词编辑)
@@ -633,8 +652,10 @@ def build_ui():
633
  generating_flag = gr.State(False)
634
  dyn = gr.Column()
635
 
636
- # 预创建文本框,依据当前模板设置初始可见性和值
637
  textboxes = []
 
 
638
  init_lines = []
639
  init_offsets = []
640
  if template_sel.value:
@@ -648,22 +669,29 @@ def build_ui():
648
  val = init_lines[i] if visible else ""
649
  tb = gr.Textbox(value=val, label=f"第 {i+1} 句", lines=1, max_lines=1, visible=visible)
650
  textboxes.append(tb)
 
 
 
651
  if template_sel.value:
652
  lines_state.value = init_lines
653
  offsets_state.value = init_offsets
 
654
 
655
  # 事件:选择模板时,更新 BGM 开关、整首混音可见性与文本框内容
656
  def on_template_change(template_name):
657
  bgm_update, lines, offsets = on_select_template(template_name)
658
  has_bgm = bool(bgm_update.get("visible", False)) if isinstance(bgm_update, dict) else False
659
  tb_updates = []
 
660
  n = len(lines)
661
  for i, tb in enumerate(textboxes):
662
  if i < n:
663
  tb_updates.append(gr.update(value=lines[i], visible=True))
 
664
  else:
665
  tb_updates.append(gr.update(value="", visible=False))
666
- # 返回:BGM、模板下拉、状态、错误清空、整首混音可见性(按是否存在BGM),以及所有文本框更新
 
667
  return (
668
  bgm_update,
669
  gr.update(choices=get_template_choices_and_bgm_visible(), value=template_name),
@@ -671,7 +699,9 @@ def build_ui():
671
  offsets,
672
  gr.update(value="", visible=False),
673
  gr.update(visible=has_bgm),
 
674
  *tb_updates,
 
675
  )
676
 
677
  # 模型切换:动态更新 speaker 下拉项
@@ -717,7 +747,7 @@ def build_ui():
717
  template_sel.change(
718
  fn=on_template_change,
719
  inputs=[template_sel],
720
- outputs=[bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, *textboxes],
721
  )
722
 
723
  # 上传模板:将用户 .ds 保存到 templates/user,并刷新模板下拉
@@ -758,21 +788,35 @@ def build_ui():
758
  # 文本提交事件:逐句渲染(为预创建文本框绑定)
759
  for idx, tb in enumerate(textboxes):
760
def make_submit(i):
    """Build the submit handler bound to textbox index ``i``."""

    def _submit(new_text, lines_list, model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v, bgm_volume_v):
        """Render line ``i`` with the submitted text and record it in the lines state."""
        # Only rows inside the current lyric list are handled.
        if not isinstance(lines_list, list) or i >= len(lines_list):
            return gr.update(), gr.update(), lines_list

        audio_path, err = render_single_line(
            model_sel_v, template_sel_v, i, new_text, speaker_v, key_shift_v, steps_v, bgm_volume_v
        )
        lines_list[i] = new_text

        if err:
            audio_update = gr.update(value=None)
            message_update = gr.update(value=f"❌ {err}", visible=True)
        else:
            audio_update = gr.update(value=audio_path)
            message_update = gr.update(value="", visible=False)
        return audio_update, message_update, lines_list

    return _submit
 
772
  tb.submit(
773
  fn=make_submit(idx),
774
- inputs=[tb, lines_state, model_sel, template_sel, speaker, key_shift, steps, bgm_volume],
775
- outputs=[per_line_audio, per_line_error, lines_state],
776
  )
777
 
778
  # 生成整首(支持进度与中断)
@@ -858,7 +902,7 @@ def build_ui():
858
  outputs=[full_vocal, full_mixed, gen_btn, progress_md, stop_flag, generating_flag],
859
  )
860
 
861
- # 下载当前编辑后的 ds
862
  def build_current_ds(template_sel_v, lines, offsets):
863
  mapping = find_templates()
864
  if not template_sel_v or template_sel_v not in mapping:
@@ -868,17 +912,104 @@ def build_ui():
868
  # 覆盖 text,并基于最新文本重算 ph_seq / ph_num
869
  for i in range(min(len(ds), len(lines or []))):
870
  new_text = lines[i] if lines and i < len(lines) else ds[i].get("text", "")
871
- ds[i]["text"] = new_text
872
- phonemes = get_phonemes(new_text, get_opencpop_dict())
873
- ds[i]["ph_seq"] = " ".join(phonemes)
874
- ds[i]["ph_num"] = " ".join(map(str, Phome(phonemes).get_ph_num())) if Phome else ds[i].get("ph_num", "")
875
  # 输出到 output/pred_all/<template>/edits
876
  ts_tag = time.strftime("%Y%m%d_%H%M%S")
877
  out_dir = OUTPUT_DIR / template_sel_v / "edits"
878
  out_dir.mkdir(parents=True, exist_ok=True)
879
- out_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.ds"
880
- out_path.write_text(json.dumps(ds, ensure_ascii=False, indent=2), encoding="utf-8")
881
- return str(out_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
882
 
883
  download_btn.click(
884
  fn=build_current_ds,
@@ -906,10 +1037,86 @@ def build_ui():
906
  tpl_upds = on_template_change(template_name)
907
  return (spk_upd, *tpl_upds)
908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  demo.load(
910
  fn=on_app_load,
911
  inputs=[model_sel, template_sel],
912
- outputs=[speaker, bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, *textboxes],
913
  )
914
 
915
  return demo
@@ -917,7 +1124,7 @@ def build_ui():
917
 
918
  def main():
919
  demo = build_ui()
920
- demo.launch(show_error=True)
921
 
922
 
923
  if __name__ == "__main__":
 
6
  import hashlib
7
  import time
8
  import math
9
+ import re
10
+ import zipfile
11
+ from typing import Any, List, Dict, Optional, Tuple
12
  from pathlib import Path
13
 
14
  import numpy as np
15
  from pydub import AudioSegment
16
+
17
+ from diffsinger_utau.voice_bank import PredAll
18
+ from diffsinger_utau.voice_bank.commons.ds_reader import DSReader
19
+ from diffsinger_utau.voice_bank.commons.phome_num_counter import Phome
20
+ from pypinyin import pinyin, Style
21
+ from pypinyin.constants import RE_HANS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # —— 文本预处理:相邻纯汉字不加空格,其余保留空格 ——
24
  def _is_hans_token(s: str) -> bool:
 
43
  out.append(" " + part)
44
  return "".join(out)
45
 
46
+
47
def validate_lyric_format(modified_text: str, original_text: str) -> Tuple[bool, str]:
    """Check that an edited lyric line keeps the layout of the original line.

    The edit is accepted when, ignoring whitespace, both texts have the same
    length and their ``AP``/``SP`` markers match pairwise in kind and in
    whitespace-free position. An empty original accepts anything.

    Args:
        modified_text: The edited lyric line.
        original_text: The template's lyric line.

    Returns:
        ``(True, "")`` when the layout matches; otherwise ``(False, markup)``
        where ``markup`` is the result of ``render_original_with_highlights``.
    """
    if not original_text:
        return True, ""

    def _compact_len(fragment: str) -> int:
        # Length once every whitespace run has been removed.
        return len(re.sub(r'\s+', '', fragment))

    def _markers(source: str):
        # (raw index, marker) for each standalone AP/SP token.
        return [(hit.start(), hit.group()) for hit in re.finditer(r'\b(AP|SP)\b', source)]

    layout_ok = _compact_len(modified_text) == _compact_len(original_text)
    if layout_ok:
        new_marks = _markers(modified_text)
        old_marks = _markers(original_text)
        layout_ok = len(new_marks) == len(old_marks) and all(
            new_kind == old_kind
            and _compact_len(modified_text[:new_at]) == _compact_len(original_text[:old_at])
            for (new_at, new_kind), (old_at, old_kind) in zip(new_marks, old_marks)
        )

    if layout_ok:
        return True, ""
    return False, render_original_with_highlights(original_text, modified_text)
92
+
93
+
94
def render_original_with_highlights(original_text: str, modified_text: str) -> str:
    """Render the original lyric line as HTML, flagging misplaced AP/SP markers.

    The whole line is wrapped in a gray ``<span>``. Each standalone ``AP`` or
    ``SP`` in the original that has no marker of the same kind at the same
    whitespace-free position in ``modified_text`` is wrapped in a red
    ``<span>``. All other characters are HTML-escaped so lyric text containing
    ``<``, ``>``, ``&`` or quotes cannot alter the generated markup.

    Args:
        original_text: The template's lyric line.
        modified_text: The edited lyric line it is compared against.

    Returns:
        An HTML fragment.
    """
    import html  # local import: only this function needs it

    # (whitespace-free position, marker) for every marker in the edited text.
    edited_markers = {
        (len(re.sub(r'\s+', '', modified_text[:hit.start()])), hit.group())
        for hit in re.finditer(r'\b(AP|SP)\b', modified_text)
    }

    result_parts = []
    total = len(original_text)
    i = 0
    clean_pos = 0  # index into the original with whitespace removed

    while i < total:
        token = original_text[i:i + 2]
        starts_token = i == 0 or not original_text[i - 1].isalnum()
        ends_token = i + 2 >= total or not original_text[i + 2].isalnum()
        if token in ('AP', 'SP') and starts_token and ends_token:
            if (clean_pos, token) in edited_markers:
                result_parts.append(token)
            else:
                result_parts.append(f'<span style="color: red;">{token}</span>')
            i += 2
            clean_pos += 2
            continue

        char = original_text[i]
        result_parts.append(html.escape(char))
        i += 1
        if not char.isspace():
            clean_pos += 1

    return f'<span style="color: gray;">{"".join(result_parts)}</span>'
130
+
131
  # 试图导入 diffsinger-utau(按要求使用该库,而非自行实现)
132
  try:
133
  import diffsinger_utau # 类型: 忽略
 
190
  return None
191
 
192
 
193
def load_ds(template_path: Path):
    """Return the entries that ``DSReader`` reads from ``template_path``."""
    return DSReader(template_path).read_ds()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  def audiosegment_from_file(path: Path) -> AudioSegment:
198
  return AudioSegment.from_file(str(path))
 
203
  seg.export(str(path), format="wav")
204
 
205
 
206
+ def overlay_bgm_snippet(vocal_wav: Path, bgm_audio: AudioSegment, offset_sec: float, bgm_volume: float = 0.3, vocal_gain_db: float = 0.0) -> AudioSegment:
207
  vocal = audiosegment_from_file(vocal_wav)
208
  if vocal_gain_db != 0.0:
209
  vocal = vocal + vocal_gain_db
 
220
  pad_ms = start_ms + len(vocal) - len(base)
221
  base = base + AudioSegment.silent(duration=pad_ms)
222
  mixed = base.overlay(vocal, position=start_ms)
223
+
224
+ return mixed[start_ms : start_ms + len(vocal)] # pyright: ignore[reportReturnType]
225
 
226
 
227
  def concat_with_offsets(clips: List[Tuple[AudioSegment, float]]) -> AudioSegment:
 
239
  return timeline
240
 
241
 
242
+ def mix_full_song(vocal: AudioSegment, bgm: AudioSegment, bgm_volume: float = 0.3) -> AudioSegment:
243
  # 保证两者同长度
244
  if len(bgm) < len(vocal):
245
  bgm = bgm + AudioSegment.silent(duration=(len(vocal) - len(bgm)))
 
254
  return bgm_adj.overlay(vocal)
255
 
256
 
257
def copy_prompt_to_clipboard(lyrics_text: str) -> str:
    """Build the AI-lyrics prompt and copy it to the clipboard.

    ``pyperclip`` is tried first. If it is not installed, or its copy call
    fails, the platform's clipboard command is used instead (``pbcopy``,
    ``xclip`` or ``clip``).

    Args:
        lyrics_text: Current lyrics, one sentence per line; may be empty.

    Returns:
        A user-facing status message: the success text, or a message starting
        with "复制失败:" that describes why nothing was copied.
    """
    import platform
    import subprocess

    processed_lyrics = preprocess_zh_spaces(lyrics_text) if lyrics_text else ""

    prompt = f"""这是原始歌词:
```txt
{processed_lyrics}
```

其中SP和AP分别代表停顿和呼吸。你应该保留原始格式,然后按照要求替换歌词。

保留原始格式的意思是每句歌词字数应该保持不变。

比如 "AP 试着 SP 掬一把星辰 SP 在手心 SP" 修改为 "AP 天空 SP 赤色的晚霞 SP 刚散去 SP" 就是符合要求的。如果有多字、少字或者AP, SP位置不对,都是不符合要求的。

现在请帮我基于上述原始歌词模板,写一首歌曲《历史的进程推着人前进》,主题为:个人奋斗固然重要,但是历史进程更加浩浩汤汤。"""

    success_message = "Prompt已复制到剪切板!"

    try:
        import pyperclip
    except ImportError:
        pyperclip = None

    if pyperclip is not None:
        try:
            pyperclip.copy(prompt)
            return success_message
        except Exception:
            # pyperclip is installed but could not copy; fall through to the
            # platform command instead of letting the error escape.
            pass

    clipboard_commands = {
        "Darwin": ["pbcopy"],  # macOS
        "Linux": ["xclip", "-selection", "clipboard"],
        "Windows": ["clip"],
    }
    system = platform.system()
    command = clipboard_commands.get(system)
    if command is None:
        # Previously an unknown platform fell through and reported success.
        return f"复制失败:不支持的平台 {system}"

    try:
        subprocess.run(command, input=prompt.encode(), check=True, timeout=5)
    except Exception as e:
        return f"复制失败:{e}"
    return success_message
293
+
294
+
295
def apply_ai_lyrics(ai_lyrics: str, original_lyrics: str) -> Tuple[str, str]:
    """Replace the original lyrics with AI-written ones when the line counts match.

    Both texts are split on ``"\\n"`` (blank lines count), and each AI line is
    stripped of surrounding whitespace.

    Args:
        ai_lyrics: Lyrics pasted back from the language model.
        original_lyrics: The lyrics currently in use.

    Returns:
        ``(lyrics, message)``. ``lyrics`` is the stripped AI text on success
        and ``original_lyrics`` unchanged otherwise; ``message`` is the
        user-facing status text.
    """
    if not (ai_lyrics and ai_lyrics.strip()):
        return original_lyrics, "请先输入回填歌词"

    # Blank lines are kept so the line count stays comparable.
    candidate_rows = [row.strip() for row in ai_lyrics.split('\n')]
    expected_count = len(original_lyrics.split('\n'))

    if len(candidate_rows) != expected_count:
        return original_lyrics, f"行数不匹配:AI歌词有{len(candidate_rows)}行,原始歌词有{expected_count}行"

    return '\n'.join(candidate_rows), "歌词应用成功!"
311
+
312
+
313
  def param_hash(model_sel: str, speaker: str, key_shift: int, steps: int, text: str) -> str:
314
  s = json.dumps(
315
  {
 
325
  )
326
  return hashlib.md5(s.encode("utf-8")).hexdigest()[:16]
327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  class DSUEngine:
329
  """
330
+ 基于 voice_bank.PredAll 的推理引擎;。
331
  """
332
+ def __init__(self):
 
333
  self.available = PredAll is not None and DSReader is not None
334
+ self.predictors: Dict[str, PredAll] = {} # model_path -> PredAll 实例
 
335
 
336
  def is_ready(self) -> bool:
337
+ return self.available
338
 
339
  def _get_predictor(self, model_path: Path):
340
  key = str(model_path.resolve())
 
356
  if self.available:
357
  predictor = self._get_predictor(model_path)
358
  # 读取 ds,并替换目标行文本与必要的音素
359
+ ds_list = DSReader(template_path).read_ds()
360
  if not (0 <= line_index < len(ds_list)):
361
  raise IndexError("行索引越界")
362
  ds = ds_list[line_index]
363
+ ds.replace(text)
 
 
 
 
 
 
364
 
365
  # 选择说话人
366
  spk = speaker
 
396
  if not out_wav.exists() or out_wav.stat().st_size == 0:
397
  raise RuntimeError("未能生成音频文件")
398
  else:
399
+ pass
 
400
 
401
+ engine = DSUEngine()
 
402
 
403
 
404
  def get_template_choices_and_bgm_visible():
 
417
  ds = load_ds(p)
418
  lines = [preprocess_zh_spaces(item.get("text", "")) for item in ds]
419
  offsets = [float(item.get("offset", 0.0)) for item in ds]
420
+ bgm_update = gr.update(visible=(bgm is not None), value=(0.3 if bgm is not None else 0.0))
421
  return bgm_update, lines, offsets
422
 
423
 
 
612
  template_sel = gr.Dropdown(choices=template_names, label="模板选择", value=(template_names[0] if template_names else None))
613
  with gr.Row(elem_classes=["compact-row"]):
614
  upload = gr.UploadButton("上传ds模板", file_types=[".ds"], elem_classes=["compact-btn"])
615
+ download_btn = gr.DownloadButton(label="下载ds&lrc", elem_classes=["compact-btn"])
616
 
617
  with gr.Row():
618
+ bgm_volume = gr.Slider(0.0, 2.0, value=0.3, step=0.01, label="BGM音量", visible=False)
619
  key_shift = gr.Slider(-12, 12, value=0, step=1, label="音高偏移")
620
  steps = gr.Slider(1, 50, value=4, step=1, label="渲染步数")
621
  speaker = gr.Dropdown(label="演唱者", choices=[], value=None, interactive=True)
 
630
  full_vocal = gr.Audio(label="整首(人声)", autoplay=False)
631
  full_mixed = gr.Audio(label="整首(混音)", autoplay=False, visible=False)
632
 
633
+ # AI歌词功能
634
+ with gr.Accordion("AI歌词", open=False):
635
+ gr.Markdown("由于服务器限制,请复制prompt,到大模型APP粘贴,并将结果回填")
636
+ copy_prompt_btn = gr.Button("复制prompt")
637
+ ai_lyrics_input = gr.Textbox(
638
+ label="回填歌词",
639
+ lines=10,
640
+ max_lines=15,
641
+ placeholder="请将大模型生成的歌词粘贴到这里..."
642
+ )
643
+ apply_lyrics_btn = gr.Button("应用歌词")
644
+
645
  # 右栏:模板与歌词编辑
646
  with gr.Column(elem_id="right-panel", scale=2):
647
  # 状态与歌词编辑容器(右栏仅歌词编辑)
 
652
  generating_flag = gr.State(False)
653
  dyn = gr.Column()
654
 
655
+ # 预创建文本框和错误提示,依据当前模板设置初始可见性和值
656
  textboxes = []
657
+ error_markdowns = []
658
+ original_lines_state = gr.State([]) # 存储原始歌词用于校验
659
  init_lines = []
660
  init_offsets = []
661
  if template_sel.value:
 
669
  val = init_lines[i] if visible else ""
670
  tb = gr.Textbox(value=val, label=f"第 {i+1} 句", lines=1, max_lines=1, visible=visible)
671
  textboxes.append(tb)
672
+ # 为每个文本框添加对应的错误提示
673
+ error_md = gr.Markdown("", visible=False)
674
+ error_markdowns.append(error_md)
675
  if template_sel.value:
676
  lines_state.value = init_lines
677
  offsets_state.value = init_offsets
678
+ original_lines_state.value = init_lines.copy()
679
 
680
  # 事件:选择模板时,更新 BGM 开关、整首混音可见性与文本框内容
681
  def on_template_change(template_name):
682
  bgm_update, lines, offsets = on_select_template(template_name)
683
  has_bgm = bool(bgm_update.get("visible", False)) if isinstance(bgm_update, dict) else False
684
  tb_updates = []
685
+ error_updates = []
686
  n = len(lines)
687
  for i, tb in enumerate(textboxes):
688
  if i < n:
689
  tb_updates.append(gr.update(value=lines[i], visible=True))
690
+ error_updates.append(gr.update(value="", visible=False))
691
  else:
692
  tb_updates.append(gr.update(value="", visible=False))
693
+ error_updates.append(gr.update(value="", visible=False))
694
+ # 返回:BGM、模板下拉、状态、错误清空、整首混音可见性(按是否存在BGM),原始歌词状态,以及所有文本框和错误提示更新
695
  return (
696
  bgm_update,
697
  gr.update(choices=get_template_choices_and_bgm_visible(), value=template_name),
 
699
  offsets,
700
  gr.update(value="", visible=False),
701
  gr.update(visible=has_bgm),
702
+ lines.copy(),
703
  *tb_updates,
704
+ *error_updates,
705
  )
706
 
707
  # 模型切换:动态更新 speaker 下拉项
 
747
  template_sel.change(
748
  fn=on_template_change,
749
  inputs=[template_sel],
750
+ outputs=[bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, original_lines_state, *textboxes, *error_markdowns],
751
  )
752
 
753
  # 上传模板:将用户 .ds 保存到 templates/user,并刷新模板下拉
 
788
  # 文本提交事件:逐句渲染(为预创建文本框绑定)
789
  for idx, tb in enumerate(textboxes):
790
def make_submit(i):
    """Build the submit handler bound to textbox index ``i``."""

    def _submit(new_text, lines_list, original_lines_list, model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v, bgm_volume_v):
        """Validate the submitted text against the original line, render it, and record it."""
        # Only rows inside the current lyric list are handled.
        if not isinstance(lines_list, list) or i >= len(lines_list):
            return gr.update(), gr.update(), gr.update(), lines_list

        # Format check against the template's original line (empty when absent).
        original_text = original_lines_list[i] if i < len(original_lines_list) else ""
        is_valid, rendered_original = validate_lyric_format(new_text, original_text)
        if is_valid:
            error_update = gr.update(value="", visible=False)
        else:
            error_update = gr.update(value=f"字数与原始文本不符:{rendered_original}", visible=True)

        # The line is rendered even when the format check failed.
        audio_path, err = render_single_line(
            model_sel_v, template_sel_v, i, new_text, speaker_v, key_shift_v, steps_v, bgm_volume_v
        )
        lines_list[i] = new_text

        if err:
            audio_update = gr.update(value=None)
            message_update = gr.update(value=f"❌ {err}", visible=True)
        else:
            audio_update = gr.update(value=audio_path)
            message_update = gr.update(value="", visible=False)
        return audio_update, message_update, error_update, lines_list

    return _submit
815
+
816
  tb.submit(
817
  fn=make_submit(idx),
818
+ inputs=[tb, lines_state, original_lines_state, model_sel, template_sel, speaker, key_shift, steps, bgm_volume],
819
+ outputs=[per_line_audio, per_line_error, error_markdowns[idx], lines_state],
820
  )
821
 
822
  # 生成整首(支持进度与中断)
 
902
  outputs=[full_vocal, full_mixed, gen_btn, progress_md, stop_flag, generating_flag],
903
  )
904
 
905
+ # 下载当前编辑后的 ds 和 lrc
906
  def build_current_ds(template_sel_v, lines, offsets):
907
  mapping = find_templates()
908
  if not template_sel_v or template_sel_v not in mapping:
 
912
  # 覆盖 text,并基于最新文本重算 ph_seq / ph_num
913
  for i in range(min(len(ds), len(lines or []))):
914
  new_text = lines[i] if lines and i < len(lines) else ds[i].get("text", "")
915
+ ds[i].replace(new_text)
916
+
 
 
917
  # 输出到 output/pred_all/<template>/edits
918
  ts_tag = time.strftime("%Y%m%d_%H%M%S")
919
  out_dir = OUTPUT_DIR / template_sel_v / "edits"
920
  out_dir.mkdir(parents=True, exist_ok=True)
921
+
922
+ # 生成DS文件
923
+ ds_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.ds"
924
+ ds_path.write_text(json.dumps(ds, ensure_ascii=False, indent=2), encoding="utf-8")
925
+
926
+ # 生成LRC文件
927
+ lrc_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.lrc"
928
+ lrc_content = generate_lrc_content(ds, lines, offsets)
929
+ lrc_path.write_text(lrc_content, encoding="utf-8")
930
+
931
+ # 生成纯字幕TXT文件
932
+ txt_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.txt"
933
+ txt_content = generate_txt_content(ds, lines)
934
+ txt_path.write_text(txt_content, encoding="utf-8")
935
+
936
+ # 创建包含DS、LRC和TXT的压缩包
937
+ zip_path = out_dir / f"{template_sel_v}_edited_{ts_tag}.zip"
938
+ with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
939
+ zipf.write(ds_path, ds_path.name)
940
+ zipf.write(lrc_path, lrc_path.name)
941
+ zipf.write(txt_path, txt_path.name)
942
+
943
+ return str(zip_path)
944
+
945
def _lyric_for_row(index, sentence_data, lines):
    """Return the edited text for a row, or the ds entry's own text when none is set."""
    if lines and index < len(lines) and lines[index]:
        return lines[index]
    return sentence_data.get("text", "")

def _format_lrc_timestamp(seconds):
    """Format seconds as an LRC ``[mm:ss.xx]`` stamp."""
    # Work in whole centiseconds so rounding carries into the next second or
    # minute (59.996 -> [01:00.00], not [00:60.00]). Negative times clamp to 0.
    total_centis = max(0, round(seconds * 100))
    minutes, remainder = divmod(total_centis, 6000)
    whole_seconds, centis = divmod(remainder, 100)
    return f"[{minutes:02d}:{whole_seconds:02d}.{centis:02d}]"

def generate_lrc_content(ds_data, lines, offsets):
    """Build the text of an LRC lyric file.

    Args:
        ds_data: ds entries; each must support ``.get("text", "")``.
        lines: Edited lyric text per entry; a missing or empty item falls back
            to the entry's own text. May be None.
        offsets: Start time in seconds per entry; a missing or None item is
            treated as 0.0. May be None.

    Returns:
        The fixed header tags, a blank line, then one time-stamped line for
        each entry whose cleaned text is non-empty.
    """
    lrc_lines = [
        "[ar:DiffSinger]",
        "[ti:Generated Song]",
        "[al:DiffSinger WebUI]",
        "[by:DiffSinger WebUI]",
        "",
    ]

    for i, sentence_data in enumerate(ds_data):
        display_text = clean_lyric_for_display(_lyric_for_row(i, sentence_data, lines))
        # Rows holding only AP/SP markers or whitespace produce no lyric line.
        if not display_text.strip():
            continue

        start_time = 0.0
        if offsets and i < len(offsets) and offsets[i] is not None:
            start_time = float(offsets[i])

        lrc_lines.append(f"{_format_lrc_timestamp(start_time)}{display_text}")

    return "\n".join(lrc_lines)

def generate_txt_content(ds_data, lines):
    """Build plain subtitle text: one cleaned lyric per line, empty rows skipped.

    Args:
        ds_data: ds entries; each must support ``.get("text", "")``.
        lines: Edited lyric text per entry, with the same fallback rule as
            ``generate_lrc_content``. May be None.
    """
    cleaned_rows = (
        clean_lyric_for_display(_lyric_for_row(i, sentence_data, lines))
        for i, sentence_data in enumerate(ds_data)
    )
    return "\n".join(row for row in cleaned_rows if row.strip())

def clean_lyric_for_display(lyric_text):
    """Strip standalone AP/SP markers and all whitespace from a lyric line."""
    if not lyric_text:
        return ""

    without_markers = re.sub(r'\b(AP|SP)\b', '', lyric_text)
    return re.sub(r'\s+', '', without_markers)
1013
 
1014
  download_btn.click(
1015
  fn=build_current_ds,
 
1037
  tpl_upds = on_template_change(template_name)
1038
  return (spk_upd, *tpl_upds)
1039
 
1040
+ # AI歌词功能事件绑定
1041
def handle_copy_prompt(lines_list):
    """Copy the AI prompt built from the current lyric lines and show the result as a toast."""
    # One lyric sentence per line; an empty or missing list gives an empty text.
    joined_lyrics = "" if not lines_list else '\n'.join(lines_list)
    status = copy_prompt_to_clipboard(joined_lyrics)
    gr.Info(status)
    return status
1048
+
1049
def handle_apply_lyrics(ai_lyrics, lines_list, original_lines_list):
    """Fill the lyric textboxes from pasted AI lyrics and validate every line.

    Returns a flat list: one update per textbox, then the error-markdown
    updates, then the new lines state. When the input is empty or its line
    count differs from ``lines_list``, a warning toast is shown and nothing
    is changed.
    """
    def _no_change():
        # No-op updates for every widget plus the untouched lines state.
        keep_boxes = [gr.update() for _ in textboxes]
        keep_errors = [gr.update() for _ in error_markdowns]
        return keep_boxes + keep_errors + [lines_list]

    if not (ai_lyrics and ai_lyrics.strip()):
        gr.Warning("请先输入回填歌词")
        return _no_change()

    # Blank lines are kept so the line count can be compared directly.
    ai_lines = [row.strip() for row in ai_lyrics.split('\n')]
    if len(ai_lines) != len(lines_list):
        gr.Warning(f"行数不匹配:AI歌词有{len(ai_lines)}行,原始歌词有{len(lines_list)}行")
        return _no_change()

    new_lines = ai_lines[:len(lines_list)]
    textbox_updates = []
    error_updates = []
    has_errors = False

    for slot in range(len(textboxes)):
        if slot >= len(new_lines):
            # Textbox beyond the lyric range: leave its value, hide its error.
            textbox_updates.append(gr.update())
            error_updates.append(gr.update(value="", visible=False))
            continue

        candidate = new_lines[slot]
        textbox_updates.append(gr.update(value=candidate))

        if slot >= len(original_lines_list):
            # No original line to validate against.
            error_updates.append(gr.update(value="", visible=False))
            continue

        is_valid, rendered_original = validate_lyric_format(candidate, original_lines_list[slot])
        if is_valid:
            error_updates.append(gr.update(value="", visible=False))
        else:
            has_errors = True
            error_updates.append(
                gr.update(value=f"字数与原始文本不符:{rendered_original}", visible=True)
            )

    # Pad so every error markdown receives an update.
    shortfall = len(error_markdowns) - len(error_updates)
    error_updates.extend(gr.update(value="", visible=False) for _ in range(shortfall))

    if has_errors:
        gr.Warning("歌词应用成功,但部分句子格式有误,请检查红色提示")
    else:
        gr.Info("歌词应用成功!所有句子格式正确")

    return textbox_updates + error_updates + [new_lines]
1103
+
1104
+ copy_prompt_btn.click(
1105
+ fn=handle_copy_prompt,
1106
+ inputs=[lines_state],
1107
+ outputs=[]
1108
+ )
1109
+
1110
+ apply_lyrics_btn.click(
1111
+ fn=handle_apply_lyrics,
1112
+ inputs=[ai_lyrics_input, lines_state, original_lines_state],
1113
+ outputs=[*textboxes, *error_markdowns, lines_state]
1114
+ )
1115
+
1116
  demo.load(
1117
  fn=on_app_load,
1118
  inputs=[model_sel, template_sel],
1119
+ outputs=[speaker, bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, original_lines_state, *textboxes, *error_markdowns],
1120
  )
1121
 
1122
  return demo
 
1124
 
1125
  def main():
1126
  demo = build_ui()
1127
+ demo.launch()
1128
 
1129
 
1130
  if __name__ == "__main__":