SoulX-Singer / webui.py
Xinsheng-Wang's picture
Upload folder using huggingface_hub
aeb9bb1 verified
import os
import re
import random
import shutil
import sys
import traceback
from pathlib import Path
from typing import Literal, Tuple
import numpy as np
import torch
import librosa
import soundfile as sf
import gradio as gr
# Import spaces for ZeroGPU support (if available)
try:
import spaces
except ImportError:
# spaces module not available (not running on HF Spaces or not ZeroGPU)
spaces = None
from preprocess.pipeline import PreprocessPipeline
from soulxsinger.utils.file_utils import load_config
from cli.inference import build_model as build_svs_model, process as svs_process
ROOT = Path(__file__).parent
ENGLISH_EXAMPLE_PROMPT_AUDIO = "example/audio/en_prompt.mp3"
ENGLISH_EXAMPLE_PROMPT_META = "example/audio/en_prompt.json"
ENGLISH_EXAMPLE_TARGET_AUDIO = "example/audio/en_target.mp3"
ENGLISH_EXAMPLE_TARGET_META = "example/audio/en_target.json"
MANDARIN_EXAMPLE_PROMPT_AUDIO = "example/audio/zh_prompt.mp3"
MANDARIN_EXAMPLE_PROMPT_META = "example/audio/zh_prompt.json"
MANDARIN_EXAMPLE_TARGET_AUDIO = "example/audio/zh_target.mp3"
MANDARIN_EXAMPLE_TARGET_META = "example/audio/zh_target.json"
CANTONESE_EXAMPLE_PROMPT_AUDIO = "example/audio/yue_prompt.mp3"
CANTONESE_EXAMPLE_PROMPT_META = "example/audio/yue_prompt.json"
CANTONESE_EXAMPLE_TARGET_AUDIO = "example/audio/yue_target.mp3"
CANTONESE_EXAMPLE_TARGET_META = "example/audio/yue_target.json"
MUSIC_EXAMPLE_TARGET_AUDIO = "example/audio/music.mp3"
MUSIC_EXAMPLE_TARGET_META = "example/audio/music.json"
# Lyric language: value (Mandarin/Cantonese/English) is passed to PreprocessPipeline; display labels from i18n via get_lyric_lang_choices()
# Use absolute paths so Examples load correctly (including File components for metadata)
EXAMPLES_LIST = [
[
str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
str(ROOT / MANDARIN_EXAMPLE_TARGET_AUDIO),
str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
str(ROOT / MANDARIN_EXAMPLE_TARGET_META),
"Mandarin",
"Mandarin",
"melody",
False,
True,
True,
0,
],
[
str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
str(ROOT / CANTONESE_EXAMPLE_TARGET_AUDIO),
str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
str(ROOT / CANTONESE_EXAMPLE_TARGET_META),
"Mandarin",
"Cantonese",
"melody",
False,
True,
True,
0,
],
[
str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
str(ROOT / ENGLISH_EXAMPLE_TARGET_AUDIO),
str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
str(ROOT / ENGLISH_EXAMPLE_TARGET_META),
"Mandarin",
"English",
"melody",
False,
True,
True,
0,
],
[
str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
str(ROOT / MUSIC_EXAMPLE_TARGET_AUDIO),
str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
str(ROOT / MUSIC_EXAMPLE_TARGET_META),
"Mandarin",
"Mandarin",
"melody",
False,
True,
True,
0,
],
]
def _load_example(choice_value):
"""Return 11 example values + skip_clear_count (2 when loading example so next 2 audio.change events don't clear metadata).
choice_value: selected dropdown string (or index in older flow); map to example index 0/1/2."""
if choice_value is None:
return [gr.update()] * 11 + [0]
idx = 0
if isinstance(choice_value, int):
idx = 0 if choice_value <= 0 else min(choice_value - 1, len(EXAMPLES_LIST) - 1)
else:
if choice_value == i18n("example_choice_1"):
idx = 1
elif choice_value == i18n("example_choice_2"):
idx = 2
elif choice_value == i18n("example_choice_3"):
idx = 3
elif choice_value == i18n("example_choice_4"):
idx = 4
if idx <= 0:
return [gr.update()] * 11 + [0]
list_idx = idx - 1
if list_idx >= len(EXAMPLES_LIST):
return [gr.update()] * 11 + [0]
row = EXAMPLES_LIST[list_idx]
return [
row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9], row[10],
2, # skip_clear_metadata_count: next 2 audio.change events (prompt + target) will not clear metadata
]
def _clear_prompt_meta_unless_example(_audio, skip_count):
if skip_count and skip_count > 0:
return gr.skip(), max(0, skip_count - 1)
return None, 0
def _clear_target_meta_unless_example(_audio, skip_count):
if skip_count and skip_count > 0:
return gr.skip(), max(0, skip_count - 1)
return None, 0
def _get_device() -> str:
"""Use CUDA if available, else CPU (e.g. for CI or CPU-only environments)."""
return "cuda:0" if torch.cuda.is_available() else "cpu"
def _session_dir_from_target(target_audio_path: str) -> Path:
stem = Path(target_audio_path).stem
safe = re.sub(r"[^\w\-]", "_", stem)
safe = re.sub(r"_+", "_", safe).strip("_") or "session"
return ROOT / "outputs" / "gradio" / safe[:64]
class AppState:
def __init__(self) -> None:
self.device = _get_device()
self.preprocess_pipeline = PreprocessPipeline(
device=self.device,
language="Mandarin",
save_dir=str(ROOT / "outputs" / "gradio" / "_placeholder" / "transcriptions"),
vocal_sep=True,
max_merge_duration=60000,
)
config = load_config("soulxsinger/config/soulxsinger.yaml")
self.svs_config = config
self.svs_model = build_svs_model(
model_path="pretrained_models/SoulX-Singer/model.pt",
config=config,
device=self.device,
)
self.phoneset_path = "soulxsinger/utils/phoneme/phone_set.json"
def run_preprocess(
self,
prompt_path: Path,
target_path: Path,
session_base: Path,
prompt_vocal_sep: bool,
target_vocal_sep: bool,
prompt_lyric_lang: str,
target_lyric_lang: str,
) -> Tuple[bool, str]:
try:
self.preprocess_pipeline.save_dir = str(session_base / "transcriptions" / "prompt")
self.preprocess_pipeline.run(
audio_path=str(prompt_path),
vocal_sep=prompt_vocal_sep,
max_merge_duration=20000,
language=prompt_lyric_lang or "Mandarin",
)
self.preprocess_pipeline.save_dir = str(session_base / "transcriptions" / "target")
self.preprocess_pipeline.run(
audio_path=str(target_path),
vocal_sep=target_vocal_sep,
max_merge_duration=60000,
language=target_lyric_lang or "Mandarin",
)
return True, "preprocess done"
except Exception as e:
return False, f"preprocess failed: {e}"
def run_svs(
self,
control: str,
session_base: Path,
auto_shift: bool,
pitch_shift: int,
) -> Tuple[bool, str, Path | None, Path | None, Path | None]:
if control not in ("melody", "score"):
control = "score"
save_dir = session_base / "generated"
save_dir.mkdir(parents=True, exist_ok=True)
class Args:
pass
args = Args()
args.device = self.device
args.model_path = "pretrained_models/SoulX-Singer/model.pt"
args.config = "soulxsinger/config/soulxsinger.yaml"
args.prompt_wav_path = str(session_base / "audio" / "prompt.wav")
prompt_meta_path = session_base / "transcriptions" / "prompt" / "metadata.json"
target_meta_path = session_base / "transcriptions" / "target" / "metadata.json"
args.prompt_metadata_path = str(prompt_meta_path)
args.target_metadata_path = str(target_meta_path)
args.phoneset_path = self.phoneset_path
args.save_dir = str(save_dir)
args.auto_shift = auto_shift
args.pitch_shift = int(pitch_shift)
args.control = control
try:
svs_process(args, self.svs_config, self.svs_model)
generated = save_dir / "generated.wav"
if not generated.exists():
return False, f"inference finished but {generated} not found", None, prompt_meta_path, target_meta_path
return True, "svs inference done", generated, prompt_meta_path, target_meta_path
except Exception as e:
return False, f"svs inference failed: {e}", None, prompt_meta_path, target_meta_path
def run_svs_from_paths(
self,
prompt_wav_path: str,
prompt_metadata_path: str,
target_metadata_path: str,
control: str,
auto_shift: bool,
pitch_shift: int,
save_dir: Path | None = None,
) -> Tuple[bool, str, Path | None]:
"""Run SVS from explicit prompt wav and metadata paths."""
if save_dir is None:
import uuid
save_dir = ROOT / "outputs" / "gradio" / "synthesis" / str(uuid.uuid4())[:8]
save_dir = Path(save_dir)
audio_dir = save_dir / "audio"
prompt_meta_dir = save_dir / "transcriptions" / "prompt"
target_meta_dir = save_dir / "transcriptions" / "target"
audio_dir.mkdir(parents=True, exist_ok=True)
prompt_meta_dir.mkdir(parents=True, exist_ok=True)
target_meta_dir.mkdir(parents=True, exist_ok=True)
shutil.copy2(prompt_wav_path, audio_dir / "prompt.wav")
shutil.copy2(prompt_metadata_path, prompt_meta_dir / "metadata.json")
shutil.copy2(target_metadata_path, target_meta_dir / "metadata.json")
ok, msg, merged, _, _ = self.run_svs(
control=control,
session_base=save_dir,
auto_shift=auto_shift,
pitch_shift=pitch_shift,
)
if not ok or merged is None:
return False, msg or "svs failed", None
return True, "svs inference done", merged
# Lazy initialization for ZeroGPU support
_app_state = None
def get_app_state():
"""Get or create AppState instance. For ZeroGPU, this will be decorated with @spaces.GPU."""
global _app_state
if _app_state is None:
_app_state = AppState()
return _app_state
# Decorate get_app_state for ZeroGPU (other functions will be decorated after they are defined)
if spaces is not None:
get_app_state = spaces.GPU()(get_app_state)
# Initialize AppState at module level (for non-ZeroGPU environments)
# For ZeroGPU, initialization will happen lazily in functions decorated with @spaces.GPU
if spaces is None:
APP_STATE = AppState()
else:
# For ZeroGPU, use lazy initialization
APP_STATE = None
# i18n
_i18n_key2lang_dict = dict(
display_lang_label=dict(en="Display Language", zh="显示语言"),
seed_label=dict(en="Seed", zh="种子"),
prompt_audio_label=dict(en="Prompt audio (reference voice), limit to 30 seconds", zh="Prompt 音频(参考音色),限制在 30 秒以内"),
target_audio_label=dict(en="Target audio (melody / lyrics source), limit to 60 seconds", zh="Target 音频(旋律/歌词来源),限制在 60 秒以内"),
generate_btn_label=dict(en="Start SVS", zh="开始 SVS"),
transcription_btn_label=dict(en="Run singing transcription", zh="开始歌声转录"),
synthesis_btn_label=dict(en="Run singing synthesis", zh="歌声合成"),
prompt_meta_label=dict(en="Prompt metadata", zh="Prompt metadata"),
target_meta_label=dict(en="Target metadata", zh="Target metadata"),
edit_tutorial_html=dict(
en='<p class="mb-0">Refer to <a href="https://github.com/Soul-AILab/SoulX-Singer/tree/main/preprocess#step-2-edit-in-the-midi-editor" target="_blank" rel="noopener">Edit Tutorial</a> for metadata editing (Important Note: The generated metadata may not perfectly align the singing audio with the corresponding lyrics and musical notes. For better results, we strongly recommend manually correcting the alignment. You can directly use <a href="https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor" target="_blank" rel="noopener">SoulX-Singer-Midi-Editor</a> to edit) </p>',
zh='<p class="mb-0">metadata 编辑请参考 <a href="https://github.com/Soul-AILab/SoulX-Singer/tree/main/preprocess#step-2-edit-in-the-midi-editor" target="_blank" rel="noopener">编辑教程</a> (重要提示:自动生成的 metadata 在音频与歌词、音高对齐效果通常不理想。为了获得更好的结果,我们强烈建议手动纠正对齐,否则会导致合成效果不佳。 你可以直接使用 <a href="https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor" target="_blank" rel="noopener">SoulX-Singer-Midi-Editor</a> 进行编辑) </p>',
),
prompt_wav_label=dict(en="Prompt WAV (reference)", zh="Prompt WAV(参考音色)"),
generated_audio_label=dict(en="Generated merged audio", zh="合成结果音频"),
prompt_lyric_lang_label=dict(en="Prompt lyric language", zh="Prompt 歌词语种"),
target_lyric_lang_label=dict(en="Target lyric language", zh="Target 歌词语种"),
lyric_lang_mandarin=dict(en="Mandarin", zh="普通话"),
lyric_lang_cantonese=dict(en="Cantonese", zh="粤语"),
lyric_lang_english=dict(en="English", zh="英语"),
warn_missing_synthesis=dict(en="Please provide prompt WAV, prompt metadata, and target metadata", zh="请提供 Prompt WAV、Prompt metadata 与 Target metadata"),
prompt_vocal_sep_label=dict(en="Prompt vocal separation", zh="Prompt人声分离"),
target_vocal_sep_label=dict(en="Target vocal separation", zh="Target人声分离"),
auto_shift_label=dict(en="Auto pitch shift", zh="自动变调"),
pitch_shift_label=dict(en="Pitch shift (semitones)", zh="指定变调(半音)"),
control_type_label=dict(en="Control type", zh="控制类型"),
examples_label=dict(en="Reference examples (click to load)", zh="参考样例(点击加载)"),
example_choice_0=dict(en="—", zh="—"),
example_choice_1=dict(en="Example 1: Mandarin → Mandarin (melody), Start singing synthesis!", zh="样例 1: 普通话 → 普通话 (melody), 开始歌声合成吧!"),
example_choice_2=dict(en="Example 2: Mandarin → Cantonese (melody), Start singing synthesis!", zh="样例 2: 普通话 → 粤语 (melody), 开始歌声合成吧!"),
example_choice_3=dict(en="Example 3: Mandarin → English (melody), Start singing synthesis!", zh="样例 3: 普通话 → 英语 (melody), 开始歌声合成吧!"),
example_choice_4=dict(en="Example 4: Mandarin → Music (score), Start singing synthesis!", zh="样例 4: 普通话 → 音乐 (score), 开始歌声合成吧!"),
warn_missing_audio=dict(
en="Please upload both prompt audio and target audio",
zh="请上传 Prompt 音频与 Target 音频",
),
# Instruction panel (workflow description)
instruction_title=dict(en="Usage", zh="使用说明"),
instruction_p1=dict(
en="After uploading prompt and target audio and clicking **Run singing transcription**, the system generates two metadata files (prompt and target).",
zh="上传 Prompt 与 Target 音频并点击「开始歌声转录」后,将生成 Prompt 与 Target 两份 metadata 文件。",
),
instruction_p2=dict(
en="Auto-transcribed lyrics and notes are often misaligned. For better results, import the generated metadata into the **MIDI Editor** for manual adjustment: [SoulX-Singer-Midi-Editor](https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor).",
zh="自动转录的歌词与音高对齐效果通常不理想,建议将生成的 metadata 导入 **MIDI 编辑器** 进行手动调整:[SoulX-Singer-Midi-Editor](https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor)。",
),
instruction_p3=dict(
en="Re-upload the adjusted metadata to the corresponding Prompt / Target Meta fields, then click **Run singing synthesis** to generate the final audio.",
zh="将调整后的 metadata 重新上传至对应的 Prompt / Target Meta 位置后,点击「歌声合成」开始最终生成。",
),
)
def _detect_initial_lang() -> Literal["zh", "en"]:
"""Detect initial UI language from server locale (browser language applied later via JS)."""
try:
import locale
loc = (locale.getdefaultlocale()[0] or os.environ.get("LANG", "") or "").lower()
return "en" if loc.startswith("en") else "zh"
except Exception:
return "zh"
global_lang: Literal["zh", "en"] = _detect_initial_lang()
def i18n(key: str) -> str:
return _i18n_key2lang_dict[key][global_lang]
def get_lyric_lang_choices():
"""Lyric language dropdown (display, value) for current UI language."""
return [
(i18n("lyric_lang_mandarin"), "Mandarin"),
(i18n("lyric_lang_cantonese"), "Cantonese"),
(i18n("lyric_lang_english"), "English"),
]
def _resolve_file_path(x):
"""Gradio file input can be path string or (path, None) tuple."""
if x is None:
return None
if isinstance(x, tuple):
x = x[0]
return x if (x and os.path.isfile(x)) else None
def transcription_function(
prompt_audio,
target_audio,
prompt_metadata,
target_metadata,
prompt_lyric_lang: str,
target_lyric_lang: str,
prompt_vocal_sep: bool,
target_vocal_sep: bool,
):
"""Step 1: Run transcription only; output (prompt_meta_path, target_meta_path)."""
global APP_STATE
if APP_STATE is None:
APP_STATE = get_app_state()
try:
if isinstance(prompt_audio, tuple):
prompt_audio = prompt_audio[0]
if isinstance(target_audio, tuple):
target_audio = target_audio[0]
if prompt_audio is None or target_audio is None:
gr.Warning(message=i18n("warn_missing_audio"))
return None, None
prompt_meta_resolved = _resolve_file_path(prompt_metadata)
target_meta_resolved = _resolve_file_path(target_metadata)
use_input_metadata = prompt_meta_resolved is not None and target_meta_resolved is not None
session_base = _session_dir_from_target(target_audio)
audio_dir = session_base / "audio"
audio_dir.mkdir(parents=True, exist_ok=True)
transfer_prompt_path = audio_dir / "prompt.wav"
transfer_target_path = audio_dir / "target.wav"
SR = 44100
PROMPT_MAX_SEC = 30
TARGET_MAX_SEC = 60
prompt_audio_data, _ = librosa.load(prompt_audio, sr=SR, mono=True)
target_audio_data, _ = librosa.load(target_audio, sr=SR, mono=True)
prompt_audio_data = prompt_audio_data[: PROMPT_MAX_SEC * SR]
target_audio_data = target_audio_data[: TARGET_MAX_SEC * SR]
sf.write(transfer_prompt_path, prompt_audio_data, SR)
sf.write(transfer_target_path, target_audio_data, SR)
prompt_meta_path = session_base / "transcriptions" / "prompt" / "metadata.json"
target_meta_path = session_base / "transcriptions" / "target" / "metadata.json"
if use_input_metadata:
(session_base / "transcriptions" / "prompt").mkdir(parents=True, exist_ok=True)
(session_base / "transcriptions" / "target").mkdir(parents=True, exist_ok=True)
shutil.copy2(prompt_meta_resolved, prompt_meta_path)
shutil.copy2(target_meta_resolved, target_meta_path)
else:
ok, msg = APP_STATE.run_preprocess(
transfer_prompt_path,
transfer_target_path,
session_base,
prompt_vocal_sep=prompt_vocal_sep,
target_vocal_sep=target_vocal_sep,
prompt_lyric_lang=prompt_lyric_lang or "Mandarin",
target_lyric_lang=target_lyric_lang or "Mandarin",
)
if not ok:
print(msg, file=sys.stderr, flush=True)
return None, None
prompt_meta_file = str(prompt_meta_path) if prompt_meta_path.exists() else None
target_meta_file = str(target_meta_path) if target_meta_path.exists() else None
return prompt_meta_file, target_meta_file
except Exception:
print(traceback.format_exc(), file=sys.stderr, flush=True)
return None, None
def synthesis_function(
prompt_audio,
prompt_metadata,
target_metadata,
control: str,
auto_shift: bool,
pitch_shift,
seed: int,
):
"""Step 2: Run SVS from top prompt_audio + prompt_metadata + target_metadata."""
global APP_STATE
if APP_STATE is None:
APP_STATE = get_app_state()
try:
if isinstance(prompt_audio, tuple):
prompt_audio = prompt_audio[0]
prompt_wav_path = prompt_audio
prompt_meta_path = _resolve_file_path(prompt_metadata)
target_meta_path = _resolve_file_path(target_metadata)
if not prompt_wav_path or not os.path.isfile(prompt_wav_path):
gr.Warning(message=i18n("warn_missing_synthesis"))
return None
if not prompt_meta_path or not os.path.isfile(prompt_meta_path):
gr.Warning(message=i18n("warn_missing_synthesis"))
return None
if not target_meta_path or not os.path.isfile(target_meta_path):
gr.Warning(message=i18n("warn_missing_synthesis"))
return None
if control not in ("melody", "score"):
control = "score"
seed = int(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
ok, msg, merged = APP_STATE.run_svs_from_paths(
prompt_wav_path=prompt_wav_path,
prompt_metadata_path=prompt_meta_path,
target_metadata_path=target_meta_path,
control=control,
auto_shift=auto_shift,
pitch_shift=int(pitch_shift),
)
if not ok or merged is None:
print(msg or "synthesis failed", file=sys.stderr, flush=True)
return None
return str(merged)
except Exception:
print(traceback.format_exc(), file=sys.stderr, flush=True)
return None
# Decorate GPU-using functions for ZeroGPU (after they are defined)
if spaces is not None:
transcription_function = spaces.GPU()(transcription_function)
synthesis_function = spaces.GPU()(synthesis_function)
def _instruction_md() -> str:
"""Markdown content for the instruction panel (supports links)."""
return "\n\n".join([
f"**1.** {i18n('instruction_p1')}",
f"**2.** {i18n('instruction_p2')}",
f"**3.** {i18n('instruction_p3')}",
])
def render_interface() -> gr.Blocks:
with gr.Blocks(title="SoulX-Singer 歌声合成Demo", theme=gr.themes.Default()) as page:
gr.HTML(
'<div style="'
'text-align: center; '
'padding: 1.25rem 0 1.5rem; '
'margin-bottom: 0.5rem;'
'">'
'<div style="'
'display: inline-block; '
'font-size: 1.75rem; '
'font-weight: 700; '
'letter-spacing: 0.02em; '
'color: #1a1a2e; '
'line-height: 1.3;'
'">SoulX-Singer</div>'
'<div style="'
'width: 80px; '
'height: 3px; '
'margin: 1rem auto 0; '
'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
'border-radius: 2px;'
'"></div>'
'</div>'
)
# Auto-detect browser language: run after Gradio mounts
gr.HTML(
'<script type="text/javascript">'
'(function(){'
'function setLang(){'
'var lang=(navigator.language||navigator.userLanguage||"").toLowerCase();'
'if(lang.startsWith("en")){'
'var inputs=document.querySelectorAll("#lang_choice_radio input");'
'if(inputs.length>1)inputs[1].click();'
'}'
'}'
'if(document.readyState==="complete")setTimeout(setLang,800);'
'else window.addEventListener("load",function(){setTimeout(setLang,800);});'
'})();'
'</script>',
visible=False,
)
with gr.Row(equal_height=True):
lang_choice = gr.Radio(
choices=["中文", "English"],
value="中文",
label=i18n("display_lang_label"),
type="index",
interactive=True,
elem_id="lang_choice_radio",
)
# Instruction panel (usage workflow); updates on language change
instruction_md = gr.Markdown(f"### {i18n('instruction_title')}\n\n{_instruction_md()}")
# Reference examples — at the front of operations (handler registered after components exist)
skip_clear_metadata_count = gr.State(0)
with gr.Row():
_example_choices = [i18n("example_choice_0"), i18n("example_choice_1"), i18n("example_choice_2"), i18n("example_choice_3"), i18n("example_choice_4")]
example_choice = gr.Dropdown(
label=i18n("examples_label"),
choices=_example_choices,
value=_example_choices[0],
interactive=True,
)
# Step 1: Transcription (audio → metadata)
with gr.Row(equal_height=True):
with gr.Column(scale=1):
prompt_audio = gr.Audio(
label=i18n("prompt_audio_label"),
type="filepath",
editable=False,
interactive=True,
)
with gr.Column(scale=1):
target_audio = gr.Audio(
label=i18n("target_audio_label"),
type="filepath",
editable=False,
interactive=True,
)
with gr.Row(equal_height=True):
prompt_lyric_lang = gr.Dropdown(
label=i18n("prompt_lyric_lang_label"),
choices=get_lyric_lang_choices(),
value="Mandarin",
interactive=True,
scale=1,
)
target_lyric_lang = gr.Dropdown(
label=i18n("target_lyric_lang_label"),
choices=get_lyric_lang_choices(),
value="Mandarin",
interactive=True,
scale=1,
)
prompt_vocal_sep = gr.Checkbox(
label=i18n("prompt_vocal_sep_label"),
value=False,
interactive=True,
scale=1,
)
target_vocal_sep = gr.Checkbox(
label=i18n("target_vocal_sep_label"),
value=True,
interactive=True,
scale=1,
)
with gr.Row():
transcription_btn = gr.Button(
value=i18n("transcription_btn_label"),
variant="primary",
size="lg",
)
# Edit tutorial link (gr.HTML supports links; component labels do not)
metadata_tutorial_html = gr.HTML(value=i18n("edit_tutorial_html"))
# Synthesis: params row, then synthesis button on next row
with gr.Row(equal_height=True):
prompt_metadata = gr.File(
label=i18n("prompt_meta_label"),
type="filepath",
file_types=[".json"],
interactive=True,
)
target_metadata = gr.File(
label=i18n("target_meta_label"),
type="filepath",
file_types=[".json"],
interactive=True,
)
control_radio = gr.Radio(
choices=["melody", "score"],
value="score",
label=i18n("control_type_label"),
scale=1,
)
auto_shift = gr.Checkbox(
label=i18n("auto_shift_label"),
value=True,
interactive=True,
scale=1,
)
pitch_shift = gr.Number(
label=i18n("pitch_shift_label"),
value=0,
minimum=-36,
maximum=36,
step=1,
interactive=True,
scale=1,
)
seed_input = gr.Number(
label=i18n("seed_label"),
value=12306,
step=1,
interactive=True,
scale=1,
)
with gr.Row():
synthesis_btn = gr.Button(
value=i18n("synthesis_btn_label"),
variant="primary",
size="lg",
)
with gr.Row():
output_audio = gr.Audio(
label=i18n("generated_audio_label"),
type="filepath",
interactive=False,
)
example_choice.change(
fn=_load_example,
inputs=[example_choice],
outputs=[
prompt_audio,
target_audio,
prompt_metadata,
target_metadata,
prompt_lyric_lang,
target_lyric_lang,
control_radio,
prompt_vocal_sep,
target_vocal_sep,
auto_shift,
pitch_shift,
skip_clear_metadata_count,
],
)
def _change_component_language(lang):
global global_lang
global_lang = ["zh", "en"][lang]
choices = get_lyric_lang_choices()
return [
gr.update(label=i18n("prompt_audio_label")),
gr.update(label=i18n("target_audio_label")),
gr.update(label=i18n("prompt_lyric_lang_label"), choices=choices),
gr.update(label=i18n("target_lyric_lang_label"), choices=choices),
gr.update(label=i18n("prompt_vocal_sep_label")),
gr.update(label=i18n("target_vocal_sep_label")),
gr.update(value=i18n("transcription_btn_label")),
gr.update(label=i18n("prompt_meta_label")),
gr.update(label=i18n("target_meta_label")),
gr.update(value=i18n("edit_tutorial_html")),
gr.update(label=i18n("control_type_label")),
gr.update(label=i18n("auto_shift_label")),
gr.update(label=i18n("pitch_shift_label")),
gr.update(label=i18n("seed_label")),
gr.update(value=i18n("synthesis_btn_label")),
gr.update(label=i18n("generated_audio_label")),
gr.update(label=i18n("display_lang_label")),
gr.update(
label=i18n("examples_label"),
choices=[i18n("example_choice_0"), i18n("example_choice_1"), i18n("example_choice_2"), i18n("example_choice_3"), i18n("example_choice_4")],
value=i18n("example_choice_0"),
),
gr.update(value=f"### {i18n('instruction_title')}\n\n{_instruction_md()}"),
]
lang_choice.change(
fn=_change_component_language,
inputs=[lang_choice],
outputs=[
prompt_audio,
target_audio,
prompt_lyric_lang,
target_lyric_lang,
prompt_vocal_sep,
target_vocal_sep,
transcription_btn,
prompt_metadata,
target_metadata,
metadata_tutorial_html,
control_radio,
auto_shift,
pitch_shift,
seed_input,
synthesis_btn,
output_audio,
lang_choice,
example_choice,
instruction_md,
],
)
# Upload new prompt/target audio → clear corresponding metadata; skip clear when change came from load example
prompt_audio.change(
fn=_clear_prompt_meta_unless_example,
inputs=[prompt_audio, skip_clear_metadata_count],
outputs=[prompt_metadata, skip_clear_metadata_count],
)
target_audio.change(
fn=_clear_target_meta_unless_example,
inputs=[target_audio, skip_clear_metadata_count],
outputs=[target_metadata, skip_clear_metadata_count],
)
transcription_btn.click(
fn=transcription_function,
inputs=[
prompt_audio,
target_audio,
prompt_metadata,
target_metadata,
prompt_lyric_lang,
target_lyric_lang,
prompt_vocal_sep,
target_vocal_sep,
],
outputs=[prompt_metadata, target_metadata],
)
synthesis_btn.click(
fn=synthesis_function,
inputs=[
prompt_audio,
prompt_metadata,
target_metadata,
control_radio,
auto_shift,
pitch_shift,
seed_input,
],
outputs=[output_audio],
)
return page
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--port", type=int, default=7860, help="Gradio server port")
parser.add_argument("--share", action="store_true", help="Create public link")
args = parser.parse_args()
page = render_interface()
page.queue()
page.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)