Spaces:
Running
Running
将BGM开关改为音量条
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import os
|
|
| 5 |
import sys
|
| 6 |
import hashlib
|
| 7 |
import time
|
|
|
|
| 8 |
from typing import List, Dict, Optional, Tuple
|
| 9 |
from pathlib import Path
|
| 10 |
|
|
@@ -12,9 +13,9 @@ import numpy as np
|
|
| 12 |
from pydub import AudioSegment
|
| 13 |
# 引入与 sample.py 一致的组件
|
| 14 |
try:
|
| 15 |
-
from voice_bank import PredAll
|
| 16 |
-
from voice_bank.commons.ds_reader import DSReader
|
| 17 |
-
from voice_bank.commons.phome_num_counter import Phome
|
| 18 |
from pypinyin import pinyin, Style
|
| 19 |
from pypinyin.constants import RE_HANS
|
| 20 |
except Exception:
|
|
@@ -166,17 +167,23 @@ def export_wav(seg: AudioSegment, path: Path):
|
|
| 166 |
seg.export(str(path), format="wav")
|
| 167 |
|
| 168 |
|
| 169 |
-
def overlay_bgm_snippet(vocal_wav: Path, bgm_audio: AudioSegment, offset_sec: float,
|
| 170 |
vocal = audiosegment_from_file(vocal_wav)
|
| 171 |
-
if
|
| 172 |
-
vocal = vocal +
|
| 173 |
start_ms = max(int(offset_sec * 1000), 0)
|
| 174 |
-
#
|
| 175 |
-
if
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
return mixed[start_ms : start_ms + len(vocal)]
|
| 181 |
|
| 182 |
|
|
@@ -195,13 +202,19 @@ def concat_with_offsets(clips: List[Tuple[AudioSegment, float]]) -> AudioSegment
|
|
| 195 |
return timeline
|
| 196 |
|
| 197 |
|
| 198 |
-
def mix_full_song(vocal: AudioSegment, bgm: AudioSegment) -> AudioSegment:
|
| 199 |
# 保证两者同长度
|
| 200 |
if len(bgm) < len(vocal):
|
| 201 |
bgm = bgm + AudioSegment.silent(duration=(len(vocal) - len(bgm)))
|
| 202 |
else:
|
| 203 |
vocal = vocal + AudioSegment.silent(duration=(len(bgm) - len(vocal)))
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
|
| 207 |
def param_hash(model_sel: str, speaker: str, key_shift: int, steps: int, text: str) -> str:
|
|
@@ -391,13 +404,13 @@ def get_template_choices_and_bgm_visible():
|
|
| 391 |
def on_select_template(template_name: str):
|
| 392 |
mapping = find_templates()
|
| 393 |
if not template_name or template_name not in mapping:
|
| 394 |
-
return gr.update(visible=False, value=
|
| 395 |
p = mapping[template_name]
|
| 396 |
bgm = bgm_path_for(p)
|
| 397 |
ds = load_ds(p)
|
| 398 |
lines = [preprocess_zh_spaces(item.get("text", "")) for item in ds]
|
| 399 |
offsets = [float(item.get("offset", 0.0)) for item in ds]
|
| 400 |
-
bgm_update = gr.update(visible=(bgm is not None), value=(bgm is not None))
|
| 401 |
return bgm_update, lines, offsets
|
| 402 |
|
| 403 |
|
|
@@ -409,7 +422,7 @@ def render_single_line(
|
|
| 409 |
speaker: str,
|
| 410 |
key_shift: int,
|
| 411 |
steps: int,
|
| 412 |
-
|
| 413 |
) -> Tuple[Optional[str], Optional[str]]:
|
| 414 |
"""
|
| 415 |
渲染单句,返回:(音频路径, 错误消息)
|
|
@@ -451,10 +464,10 @@ def render_single_line(
|
|
| 451 |
|
| 452 |
# 是否混音预览
|
| 453 |
mapping_bgm = bgm_path_for(template_path)
|
| 454 |
-
if
|
| 455 |
bgm_audio = audiosegment_from_file(mapping_bgm)
|
| 456 |
offset = float(ds[line_index].get("offset", 0.0))
|
| 457 |
-
mixed_seg = overlay_bgm_snippet(wav_out, bgm_audio, offset)
|
| 458 |
preview_wav = cache_dir / f"line_{line_index+1}_preview.wav"
|
| 459 |
export_wav(mixed_seg, preview_wav)
|
| 460 |
return str(preview_wav), None
|
|
@@ -472,7 +485,7 @@ def generate_full_song(
|
|
| 472 |
speaker: str,
|
| 473 |
key_shift: int,
|
| 474 |
steps: int,
|
| 475 |
-
|
| 476 |
) -> Tuple[Optional[str], Optional[str]]:
|
| 477 |
"""
|
| 478 |
生成整曲,返回:(主输出音频路径, 混音输出音频路径或错误消息字符串)
|
|
@@ -523,9 +536,9 @@ def generate_full_song(
|
|
| 523 |
# 混音版本(如有 BGM 且开启开关)
|
| 524 |
mixed_path = None
|
| 525 |
bgm_p = bgm_path_for(template_path)
|
| 526 |
-
if
|
| 527 |
bgm_audio = audiosegment_from_file(bgm_p)
|
| 528 |
-
mixed = mix_full_song(vocal_full, bgm_audio)
|
| 529 |
mixed_path = final_dir / f"{template_name}_mixed_{ts_tag}.wav"
|
| 530 |
export_wav(mixed, mixed_path)
|
| 531 |
|
|
@@ -589,7 +602,7 @@ def build_ui():
|
|
| 589 |
download_btn = gr.DownloadButton(label="下载当前ds状态", elem_classes=["compact-btn"])
|
| 590 |
|
| 591 |
with gr.Row():
|
| 592 |
-
|
| 593 |
key_shift = gr.Slider(-12, 12, value=0, step=1, label="音高偏移")
|
| 594 |
steps = gr.Slider(1, 50, value=4, step=1, label="渲染步数")
|
| 595 |
speaker = gr.Dropdown(label="演唱者", choices=[], value=None, interactive=True)
|
|
@@ -680,17 +693,17 @@ def build_ui():
|
|
| 680 |
outputs=[speaker],
|
| 681 |
)
|
| 682 |
|
| 683 |
-
# BGM
|
| 684 |
-
def
|
| 685 |
mapping = find_templates()
|
| 686 |
has_bgm = False
|
| 687 |
if template_name in mapping:
|
| 688 |
has_bgm = bgm_path_for(mapping[template_name]) is not None
|
| 689 |
-
return gr.update(visible=bool(
|
| 690 |
|
| 691 |
-
|
| 692 |
-
fn=
|
| 693 |
-
inputs=[
|
| 694 |
outputs=[full_mixed],
|
| 695 |
)
|
| 696 |
|
|
@@ -698,7 +711,7 @@ def build_ui():
|
|
| 698 |
template_sel.change(
|
| 699 |
fn=on_template_change,
|
| 700 |
inputs=[template_sel],
|
| 701 |
-
outputs=[
|
| 702 |
)
|
| 703 |
|
| 704 |
# 上传模板:将用户 .ds 保存到 templates/user,并刷新模板下拉
|
|
@@ -739,11 +752,11 @@ def build_ui():
|
|
| 739 |
# 文本提交事件:逐句渲染(为预创建文本框绑定)
|
| 740 |
for idx, tb in enumerate(textboxes):
|
| 741 |
def make_submit(i):
|
| 742 |
-
def _submit(new_text, lines_list, model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v,
|
| 743 |
# 仅处理当前可见范围内的行
|
| 744 |
if not isinstance(lines_list, list) or i >= max(len(lines_list), 0):
|
| 745 |
return gr.update(), gr.update(), lines_list
|
| 746 |
-
audio_path, err = render_single_line(model_sel_v, template_sel_v, i, new_text, speaker_v, key_shift_v, steps_v,
|
| 747 |
if i < len(lines_list):
|
| 748 |
lines_list[i] = new_text
|
| 749 |
if err:
|
|
@@ -752,12 +765,12 @@ def build_ui():
|
|
| 752 |
return _submit
|
| 753 |
tb.submit(
|
| 754 |
fn=make_submit(idx),
|
| 755 |
-
inputs=[tb, lines_state, model_sel, template_sel, speaker, key_shift, steps,
|
| 756 |
outputs=[per_line_audio, per_line_error, lines_state],
|
| 757 |
)
|
| 758 |
|
| 759 |
# 生成整首(支持进度与中断)
|
| 760 |
-
def on_gen_or_stop(model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v,
|
| 761 |
# 若正在生成,本次点击作为“停止”信号,仅更新按钮与提示
|
| 762 |
if generating:
|
| 763 |
stop = True
|
|
@@ -823,8 +836,8 @@ def build_ui():
|
|
| 823 |
|
| 824 |
mixed_path = None
|
| 825 |
bgm_p = bgm_path_for(template_path)
|
| 826 |
-
if
|
| 827 |
-
mixed = mix_full_song(vocal_full, audiosegment_from_file(bgm_p))
|
| 828 |
mixed_path = final_dir / f"{template_sel_v}_mixed_{ts_tag}.wav"
|
| 829 |
export_wav(mixed, mixed_path)
|
| 830 |
|
|
@@ -835,7 +848,7 @@ def build_ui():
|
|
| 835 |
|
| 836 |
gen_btn.click(
|
| 837 |
fn=on_gen_or_stop,
|
| 838 |
-
inputs=[model_sel, template_sel, speaker, key_shift, steps,
|
| 839 |
outputs=[full_vocal, full_mixed, gen_btn, progress_md, stop_flag, generating_flag],
|
| 840 |
)
|
| 841 |
|
|
@@ -890,15 +903,20 @@ def build_ui():
|
|
| 890 |
demo.load(
|
| 891 |
fn=on_app_load,
|
| 892 |
inputs=[model_sel, template_sel],
|
| 893 |
-
outputs=[speaker,
|
| 894 |
)
|
| 895 |
|
| 896 |
return demo
|
| 897 |
|
| 898 |
|
| 899 |
def main():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 900 |
demo = build_ui()
|
| 901 |
-
demo.launch()
|
| 902 |
|
| 903 |
|
| 904 |
if __name__ == "__main__":
|
|
|
|
| 5 |
import sys
|
| 6 |
import hashlib
|
| 7 |
import time
|
| 8 |
+
import math
|
| 9 |
from typing import List, Dict, Optional, Tuple
|
| 10 |
from pathlib import Path
|
| 11 |
|
|
|
|
| 13 |
from pydub import AudioSegment
|
| 14 |
# 引入与 sample.py 一致的组件
|
| 15 |
try:
|
| 16 |
+
from diffsinger_utau.voice_bank import PredAll
|
| 17 |
+
from diffsinger_utau.voice_bank.commons.ds_reader import DSReader
|
| 18 |
+
from diffsinger_utau.voice_bank.commons.phome_num_counter import Phome
|
| 19 |
from pypinyin import pinyin, Style
|
| 20 |
from pypinyin.constants import RE_HANS
|
| 21 |
except Exception:
|
|
|
|
| 167 |
seg.export(str(path), format="wav")
|
| 168 |
|
| 169 |
|
| 170 |
+
def overlay_bgm_snippet(vocal_wav: Path, bgm_audio: AudioSegment, offset_sec: float, bgm_volume: float = 1.0, vocal_gain_db: float = 0.0) -> AudioSegment:
    """Mix one rendered vocal line over the matching stretch of the backing track.

    Args:
        vocal_wav: Path of the rendered vocal audio for a single line.
        bgm_audio: The backing track (BGM) to mix under the vocal.
        offset_sec: Start of the line inside the backing track, in seconds.
            Negative values are clamped to 0.
        bgm_volume: Linear amplitude multiplier for the backing track
            (1.0 leaves it unchanged). Values <= 0 replace the backing
            track with silence.
        vocal_gain_db: Gain in dB added to the vocal before mixing; 0.0
            leaves the vocal untouched.

    Returns:
        The slice of the mix that starts at the line offset and spans
        ``len(vocal)`` milliseconds.
    """
    vocal = audiosegment_from_file(vocal_wav)
    if vocal_gain_db != 0.0:
        vocal = vocal + vocal_gain_db

    start_ms = max(int(offset_sec * 1000), 0)
    end_ms = start_ms + len(vocal)

    # Apply the BGM volume multiplier.
    if bgm_volume <= 0.0:
        # log10 is undefined for non-positive values, so a muted BGM is
        # modelled as a silent base track of exactly the required length.
        base = AudioSegment.silent(duration=end_ms)
    elif bgm_volume == 1.0:
        # A multiplier of 1.0 is 0 dB; skip the gain pass over the whole track.
        base = bgm_audio
    else:
        # Convert the linear amplitude multiplier to decibels.
        base = bgm_audio + 20.0 * math.log10(bgm_volume)

    # Make sure the base track is long enough to hold the whole vocal line.
    if len(base) < end_ms:
        base = base + AudioSegment.silent(duration=end_ms - len(base))

    mixed = base.overlay(vocal, position=start_ms)
    return mixed[start_ms:end_ms]
|
| 188 |
|
| 189 |
|
|
|
|
| 202 |
return timeline
|
| 203 |
|
| 204 |
|
| 205 |
+
def mix_full_song(vocal: AudioSegment, bgm: AudioSegment, bgm_volume: float = 1.0) -> AudioSegment:
    """Overlay the full vocal track on the backing track.

    Args:
        vocal: The complete vocal track.
        bgm: The complete backing track (BGM).
        bgm_volume: Linear amplitude multiplier for the backing track
            (1.0 leaves it unchanged). Values <= 0 replace the backing
            track with silence of the same length.

    Returns:
        The mixed track; its length is that of the longer of the two inputs.
    """
    # Pad the shorter track with silence so both have the same length.
    if len(bgm) < len(vocal):
        bgm = bgm + AudioSegment.silent(duration=(len(vocal) - len(bgm)))
    else:
        vocal = vocal + AudioSegment.silent(duration=(len(bgm) - len(vocal)))

    # Apply the BGM volume multiplier.
    if bgm_volume <= 0.0:
        # log10 is undefined for non-positive values; treat as a muted BGM.
        bgm_adj = AudioSegment.silent(duration=len(bgm))
    elif bgm_volume == 1.0:
        # A multiplier of 1.0 is 0 dB; skip the gain pass over the whole track.
        bgm_adj = bgm
    else:
        # Convert the linear amplitude multiplier to decibels.
        bgm_adj = bgm + 20.0 * math.log10(bgm_volume)

    return bgm_adj.overlay(vocal)
|
| 218 |
|
| 219 |
|
| 220 |
def param_hash(model_sel: str, speaker: str, key_shift: int, steps: int, text: str) -> str:
|
|
|
|
| 404 |
def on_select_template(template_name: str):
    """Load the selected template and derive the BGM volume control state.

    Args:
        template_name: Key into the mapping returned by ``find_templates()``.

    Returns:
        A 3-tuple ``(bgm_update, lines, offsets)``:
        * ``bgm_update`` - a ``gr.update`` carrying ``visible`` and ``value``
          (presumably for the BGM volume slider; the wiring is outside this
          function). It is hidden with value 0.0 when the template is
          unknown or has no BGM, and shown with value 1.0 otherwise.
        * ``lines`` - the ``"text"`` of every ds item after
          ``preprocess_zh_spaces``.
        * ``offsets`` - the ``"offset"`` of every ds item as a float
          (0.0 when the key is missing).
    """
    mapping = find_templates()
    if not template_name or template_name not in mapping:
        return gr.update(visible=False, value=0.0), [], []

    template_path = mapping[template_name]
    has_bgm = bgm_path_for(template_path) is not None
    ds = load_ds(template_path)

    lines = [preprocess_zh_spaces(item.get("text", "")) for item in ds]
    offsets = [float(item.get("offset", 0.0)) for item in ds]

    bgm_update = gr.update(visible=has_bgm, value=(1.0 if has_bgm else 0.0))
    return bgm_update, lines, offsets
|
| 415 |
|
| 416 |
|
|
|
|
| 422 |
speaker: str,
|
| 423 |
key_shift: int,
|
| 424 |
steps: int,
|
| 425 |
+
bgm_volume: float,
|
| 426 |
) -> Tuple[Optional[str], Optional[str]]:
|
| 427 |
"""
|
| 428 |
渲染单句,返回:(音频路径, 错误消息)
|
|
|
|
| 464 |
|
| 465 |
# 是否混音预览
|
| 466 |
mapping_bgm = bgm_path_for(template_path)
|
| 467 |
+
if (bgm_volume and bgm_volume > 0.0) and mapping_bgm and mapping_bgm.exists():
|
| 468 |
bgm_audio = audiosegment_from_file(mapping_bgm)
|
| 469 |
offset = float(ds[line_index].get("offset", 0.0))
|
| 470 |
+
mixed_seg = overlay_bgm_snippet(wav_out, bgm_audio, offset, bgm_volume)
|
| 471 |
preview_wav = cache_dir / f"line_{line_index+1}_preview.wav"
|
| 472 |
export_wav(mixed_seg, preview_wav)
|
| 473 |
return str(preview_wav), None
|
|
|
|
| 485 |
speaker: str,
|
| 486 |
key_shift: int,
|
| 487 |
steps: int,
|
| 488 |
+
bgm_volume: float,
|
| 489 |
) -> Tuple[Optional[str], Optional[str]]:
|
| 490 |
"""
|
| 491 |
生成整曲,返回:(主输出音频路径, 混音输出音频路径或错误消息字符串)
|
|
|
|
| 536 |
# Mixed version (only when the template has a BGM and the BGM volume multiplier is > 0)
|
| 537 |
mixed_path = None
|
| 538 |
bgm_p = bgm_path_for(template_path)
|
| 539 |
+
if (bgm_volume and bgm_volume > 0.0) and bgm_p and bgm_p.exists():
|
| 540 |
bgm_audio = audiosegment_from_file(bgm_p)
|
| 541 |
+
mixed = mix_full_song(vocal_full, bgm_audio, bgm_volume)
|
| 542 |
mixed_path = final_dir / f"{template_name}_mixed_{ts_tag}.wav"
|
| 543 |
export_wav(mixed, mixed_path)
|
| 544 |
|
|
|
|
| 602 |
download_btn = gr.DownloadButton(label="下载当前ds状态", elem_classes=["compact-btn"])
|
| 603 |
|
| 604 |
with gr.Row():
|
| 605 |
+
bgm_volume = gr.Slider(0.0, 2.0, value=1.0, step=0.01, label="BGM音量倍率", visible=False)
|
| 606 |
key_shift = gr.Slider(-12, 12, value=0, step=1, label="音高偏移")
|
| 607 |
steps = gr.Slider(1, 50, value=4, step=1, label="渲染步数")
|
| 608 |
speaker = gr.Dropdown(label="演唱者", choices=[], value=None, interactive=True)
|
|
|
|
| 693 |
outputs=[speaker],
|
| 694 |
)
|
| 695 |
|
| 696 |
+
# BGM 音量倍率变化:控制“整首(混音)”可见性(需当前模板存在 BGM 且倍率>0)
|
| 697 |
+
def on_bgm_volume_change(bgm_vol, template_name):
    """Decide whether the mixed full-song output should be visible.

    Args:
        bgm_vol: Current value of the BGM volume multiplier.
        template_name: Currently selected template name.

    Returns:
        A ``gr.update`` whose ``visible`` flag is True only when the
        multiplier is greater than 0 and the selected template has a BGM.
    """
    mapping = find_templates()
    has_bgm = False
    if template_name in mapping:
        has_bgm = bgm_path_for(mapping[template_name]) is not None
    # `bgm_vol and ...` also covers a None value coming from the UI.
    return gr.update(visible=bool(bgm_vol and bgm_vol > 0 and has_bgm))
|
| 703 |
|
| 704 |
+
bgm_volume.change(
|
| 705 |
+
fn=on_bgm_volume_change,
|
| 706 |
+
inputs=[bgm_volume, template_sel],
|
| 707 |
outputs=[full_mixed],
|
| 708 |
)
|
| 709 |
|
|
|
|
| 711 |
template_sel.change(
|
| 712 |
fn=on_template_change,
|
| 713 |
inputs=[template_sel],
|
| 714 |
+
outputs=[bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, *textboxes],
|
| 715 |
)
|
| 716 |
|
| 717 |
# 上传模板:将用户 .ds 保存到 templates/user,并刷新模板下拉
|
|
|
|
| 752 |
# 文本提交事件:逐句渲染(为预创建文本框绑定)
|
| 753 |
for idx, tb in enumerate(textboxes):
|
| 754 |
def make_submit(i):
|
| 755 |
+
def _submit(new_text, lines_list, model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v, bgm_volume_v):
|
| 756 |
# 仅处理当前可见范围内的行
|
| 757 |
if not isinstance(lines_list, list) or i >= max(len(lines_list), 0):
|
| 758 |
return gr.update(), gr.update(), lines_list
|
| 759 |
+
audio_path, err = render_single_line(model_sel_v, template_sel_v, i, new_text, speaker_v, key_shift_v, steps_v, bgm_volume_v)
|
| 760 |
if i < len(lines_list):
|
| 761 |
lines_list[i] = new_text
|
| 762 |
if err:
|
|
|
|
| 765 |
return _submit
|
| 766 |
tb.submit(
|
| 767 |
fn=make_submit(idx),
|
| 768 |
+
inputs=[tb, lines_state, model_sel, template_sel, speaker, key_shift, steps, bgm_volume],
|
| 769 |
outputs=[per_line_audio, per_line_error, lines_state],
|
| 770 |
)
|
| 771 |
|
| 772 |
# 生成整首(支持进度与中断)
|
| 773 |
+
def on_gen_or_stop(model_sel_v, template_sel_v, speaker_v, key_shift_v, steps_v, bgm_volume_v, lines, stop, generating, progress=gr.Progress(track_tqdm=True)):
|
| 774 |
# 若正在生成,本次点击作为“停止”信号,仅更新按钮与提示
|
| 775 |
if generating:
|
| 776 |
stop = True
|
|
|
|
| 836 |
|
| 837 |
mixed_path = None
|
| 838 |
bgm_p = bgm_path_for(template_path)
|
| 839 |
+
if (bgm_volume_v and bgm_volume_v > 0.0) and bgm_p and bgm_p.exists():
|
| 840 |
+
mixed = mix_full_song(vocal_full, audiosegment_from_file(bgm_p), bgm_volume_v)
|
| 841 |
mixed_path = final_dir / f"{template_sel_v}_mixed_{ts_tag}.wav"
|
| 842 |
export_wav(mixed, mixed_path)
|
| 843 |
|
|
|
|
| 848 |
|
| 849 |
gen_btn.click(
|
| 850 |
fn=on_gen_or_stop,
|
| 851 |
+
inputs=[model_sel, template_sel, speaker, key_shift, steps, bgm_volume, lines_state, stop_flag, generating_flag],
|
| 852 |
outputs=[full_vocal, full_mixed, gen_btn, progress_md, stop_flag, generating_flag],
|
| 853 |
)
|
| 854 |
|
|
|
|
| 903 |
demo.load(
|
| 904 |
fn=on_app_load,
|
| 905 |
inputs=[model_sel, template_sel],
|
| 906 |
+
outputs=[speaker, bgm_volume, template_sel, lines_state, offsets_state, per_line_error, full_mixed, *textboxes],
|
| 907 |
)
|
| 908 |
|
| 909 |
return demo
|
| 910 |
|
| 911 |
|
| 912 |
def main():
    """Parse the command-line options and launch the Gradio app.

    Options:
        --host: Address to bind the server to (default ``127.0.0.1``).
        --port: Port to listen on (default ``7860``).
    """
    # Imported locally so this entry point does not depend on a module-level
    # import of argparse; a repeated import is harmless.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="127.0.0.1")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    demo = build_ui()
    demo.launch(server_name=args.host, server_port=args.port, show_error=True)
|
| 920 |
|
| 921 |
|
| 922 |
if __name__ == "__main__":
|