Spaces:

Soul-AILab
/

SoulX-Singer

Running on Zero

App Files Files Community

SoulX-Singer / webui.py

Xinsheng-Wang

Upload folder using huggingface_hub

aeb9bb1 verified about 11 hours ago

raw

history blame contribute delete

33.7 kB

	import os
	import re
	import random
	import shutil
	import sys
	import traceback
	from pathlib import Path
	from typing import Literal, Tuple

	import numpy as np
	import torch
	import librosa
	import soundfile as sf
	import gradio as gr

	# Import spaces for ZeroGPU support (if available)
	try:
	import spaces
	except ImportError:
	# spaces module not available (not running on HF Spaces or not ZeroGPU)
	spaces = None

	from preprocess.pipeline import PreprocessPipeline
	from soulxsinger.utils.file_utils import load_config
	from cli.inference import build_model as build_svs_model, process as svs_process


	# Directory containing this file; example assets and outputs are resolved against it.
	ROOT = Path(__file__).parent

	# Bundled example assets, as paths relative to ROOT.
	ENGLISH_EXAMPLE_PROMPT_AUDIO = "example/audio/en_prompt.mp3"
	ENGLISH_EXAMPLE_PROMPT_META = "example/audio/en_prompt.json"
	ENGLISH_EXAMPLE_TARGET_AUDIO = "example/audio/en_target.mp3"
	ENGLISH_EXAMPLE_TARGET_META = "example/audio/en_target.json"

	MANDARIN_EXAMPLE_PROMPT_AUDIO = "example/audio/zh_prompt.mp3"
	MANDARIN_EXAMPLE_PROMPT_META = "example/audio/zh_prompt.json"
	MANDARIN_EXAMPLE_TARGET_AUDIO = "example/audio/zh_target.mp3"
	MANDARIN_EXAMPLE_TARGET_META = "example/audio/zh_target.json"

	CANTONESE_EXAMPLE_PROMPT_AUDIO = "example/audio/yue_prompt.mp3"
	CANTONESE_EXAMPLE_PROMPT_META = "example/audio/yue_prompt.json"
	CANTONESE_EXAMPLE_TARGET_AUDIO = "example/audio/yue_target.mp3"
	CANTONESE_EXAMPLE_TARGET_META = "example/audio/yue_target.json"

	MUSIC_EXAMPLE_TARGET_AUDIO = "example/audio/music.mp3"
	MUSIC_EXAMPLE_TARGET_META = "example/audio/music.json"

	# Lyric language values (Mandarin/Cantonese/English) are what PreprocessPipeline
	# receives; the labels shown in the UI come from i18n via get_lyric_lang_choices().

	# One row per selectable example. Every row uses the Mandarin prompt and differs
	# only in target audio, target metadata and target lyric language. Paths are made
	# absolute so the File components can load the metadata.
	# Row layout: prompt audio, target audio, prompt metadata, target metadata,
	# prompt lyric language, target lyric language, control type,
	# prompt vocal separation, target vocal separation, auto shift, pitch shift.
	# NOTE(review): the label of example 4 says "(score)" but its row uses "melody" - confirm.
	EXAMPLES_LIST = [
	    [
	        str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
	        str(ROOT / target_audio_rel),
	        str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
	        str(ROOT / target_meta_rel),
	        "Mandarin",
	        target_language,
	        "melody",
	        False,
	        True,
	        True,
	        0,
	    ]
	    for target_audio_rel, target_meta_rel, target_language in (
	        (MANDARIN_EXAMPLE_TARGET_AUDIO, MANDARIN_EXAMPLE_TARGET_META, "Mandarin"),
	        (CANTONESE_EXAMPLE_TARGET_AUDIO, CANTONESE_EXAMPLE_TARGET_META, "Cantonese"),
	        (ENGLISH_EXAMPLE_TARGET_AUDIO, ENGLISH_EXAMPLE_TARGET_META, "English"),
	        (MUSIC_EXAMPLE_TARGET_AUDIO, MUSIC_EXAMPLE_TARGET_META, "Mandarin"),
	    )
	]


	def _load_example(choice_value):
	    """Translate an example-dropdown selection into values for the bound components.

	    Args:
	        choice_value: The selected dropdown label, or an integer index kept for
	            the older index-based flow. ``None`` means nothing is selected.

	    Returns:
	        A 12-item list. When an example is selected it holds the 11 values of
	        the matching ``EXAMPLES_LIST`` row followed by ``2``, the skip-clear
	        counter: the next two ``audio.change`` events (prompt and target) must
	        not clear the metadata that was just filled in. Otherwise it holds 11
	        ``gr.update()`` no-ops followed by ``0``.
	    """
	    if choice_value is None:
	        return [gr.update()] * 11 + [0]

	    example_number = 0  # 0 means "no example"; 1..N select EXAMPLES_LIST[N - 1]
	    if isinstance(choice_value, int):
	        # NOTE(review): this subtracts 1 here and again below, so index 1 loads
	        # nothing and index 2 loads the first example. Kept as-is because the
	        # dropdown currently sends labels, not indices - confirm intended mapping.
	        example_number = 0 if choice_value <= 0 else min(choice_value - 1, len(EXAMPLES_LIST) - 1)
	    else:
	        # Match against every translation rather than only the active one. The
	        # active language is process-wide state, so a label rendered in one
	        # language must still resolve after another session switches language.
	        for number in range(1, len(EXAMPLES_LIST) + 1):
	            translations = _i18n_key2lang_dict.get(f"example_choice_{number}", {})
	            if choice_value in translations.values():
	                example_number = number
	                break

	    if example_number <= 0 or example_number - 1 >= len(EXAMPLES_LIST):
	        return [gr.update()] * 11 + [0]

	    row = EXAMPLES_LIST[example_number - 1]
	    # Trailing 2 = skip_clear_metadata_count for the prompt + target audio.change events.
	    return [*row[:11], 2]


	def _clear_prompt_meta_unless_example(_audio, skip_count):
	    """Decide whether a prompt-audio change should clear the prompt metadata.

	    Returns a ``(metadata_value, new_skip_count)`` pair. While ``skip_count`` is
	    positive the metadata is left alone (``gr.skip()``) and the counter is
	    decremented; otherwise the metadata is cleared (``None``) and the counter
	    is reset to 0.
	    """
	    remaining = skip_count or 0
	    if remaining <= 0:
	        return None, 0
	    return gr.skip(), max(0, remaining - 1)


	def _clear_target_meta_unless_example(_audio, skip_count):
	    """Decide whether a target-audio change should clear the target metadata.

	    Returns a ``(metadata_value, new_skip_count)`` pair. While ``skip_count`` is
	    positive the metadata is left alone (``gr.skip()``) and the counter is
	    decremented; otherwise the metadata is cleared (``None``) and the counter
	    is reset to 0.
	    """
	    remaining = skip_count or 0
	    if remaining <= 0:
	        return None, 0
	    return gr.skip(), max(0, remaining - 1)


	def _get_device() -> str:
	    """Return ``"cuda:0"`` when torch reports CUDA as available, otherwise ``"cpu"``."""
	    if torch.cuda.is_available():
	        return "cuda:0"
	    return "cpu"


	def _session_dir_from_target(target_audio_path: str) -> Path:
	    """Derive the session working directory from the target audio file name.

	    The file stem is sanitised: every character that is not a word character
	    or ``-`` becomes ``_``, runs of ``_`` collapse to one, and leading/trailing
	    ``_`` are removed. An empty result falls back to ``"session"``. The name
	    is capped at 64 characters and placed under ``ROOT/outputs/gradio``.
	    """
	    raw_stem = Path(target_audio_path).stem
	    collapsed = re.sub(r"_+", "_", re.sub(r"[^\w\-]", "_", raw_stem))
	    folder_name = collapsed.strip("_") or "session"
	    return ROOT / "outputs" / "gradio" / folder_name[:64]


	class AppState:
	def __init__(self) -> None:
	self.device = _get_device()
	self.preprocess_pipeline = PreprocessPipeline(
	device=self.device,
	language="Mandarin",
	save_dir=str(ROOT / "outputs" / "gradio" / "_placeholder" / "transcriptions"),
	vocal_sep=True,
	max_merge_duration=60000,
	)
	config = load_config("soulxsinger/config/soulxsinger.yaml")
	self.svs_config = config
	self.svs_model = build_svs_model(
	model_path="pretrained_models/SoulX-Singer/model.pt",
	config=config,
	device=self.device,
	)
	self.phoneset_path = "soulxsinger/utils/phoneme/phone_set.json"

	def run_preprocess(
	self,
	prompt_path: Path,
	target_path: Path,
	session_base: Path,
	prompt_vocal_sep: bool,
	target_vocal_sep: bool,
	prompt_lyric_lang: str,
	target_lyric_lang: str,
	) -> Tuple[bool, str]:
	try:
	self.preprocess_pipeline.save_dir = str(session_base / "transcriptions" / "prompt")
	self.preprocess_pipeline.run(
	audio_path=str(prompt_path),
	vocal_sep=prompt_vocal_sep,
	max_merge_duration=20000,
	language=prompt_lyric_lang or "Mandarin",
	)
	self.preprocess_pipeline.save_dir = str(session_base / "transcriptions" / "target")
	self.preprocess_pipeline.run(
	audio_path=str(target_path),
	vocal_sep=target_vocal_sep,
	max_merge_duration=60000,
	language=target_lyric_lang or "Mandarin",
	)
	return True, "preprocess done"
	except Exception as e:
	return False, f"preprocess failed: {e}"

	def run_svs(
	self,
	control: str,
	session_base: Path,
	auto_shift: bool,
	pitch_shift: int,
	) -> Tuple[bool, str, Path \| None, Path \| None, Path \| None]:
	if control not in ("melody", "score"):
	control = "score"
	save_dir = session_base / "generated"
	save_dir.mkdir(parents=True, exist_ok=True)
	class Args:
	pass
	args = Args()
	args.device = self.device
	args.model_path = "pretrained_models/SoulX-Singer/model.pt"
	args.config = "soulxsinger/config/soulxsinger.yaml"
	args.prompt_wav_path = str(session_base / "audio" / "prompt.wav")
	prompt_meta_path = session_base / "transcriptions" / "prompt" / "metadata.json"
	target_meta_path = session_base / "transcriptions" / "target" / "metadata.json"
	args.prompt_metadata_path = str(prompt_meta_path)
	args.target_metadata_path = str(target_meta_path)
	args.phoneset_path = self.phoneset_path
	args.save_dir = str(save_dir)
	args.auto_shift = auto_shift
	args.pitch_shift = int(pitch_shift)
	args.control = control
	try:
	svs_process(args, self.svs_config, self.svs_model)
	generated = save_dir / "generated.wav"
	if not generated.exists():
	return False, f"inference finished but {generated} not found", None, prompt_meta_path, target_meta_path
	return True, "svs inference done", generated, prompt_meta_path, target_meta_path
	except Exception as e:
	return False, f"svs inference failed: {e}", None, prompt_meta_path, target_meta_path

	def run_svs_from_paths(
	self,
	prompt_wav_path: str,
	prompt_metadata_path: str,
	target_metadata_path: str,
	control: str,
	auto_shift: bool,
	pitch_shift: int,
	save_dir: Path \| None = None,
	) -> Tuple[bool, str, Path \| None]:
	"""Run SVS from explicit prompt wav and metadata paths."""
	if save_dir is None:
	import uuid
	save_dir = ROOT / "outputs" / "gradio" / "synthesis" / str(uuid.uuid4())[:8]
	save_dir = Path(save_dir)
	audio_dir = save_dir / "audio"
	prompt_meta_dir = save_dir / "transcriptions" / "prompt"
	target_meta_dir = save_dir / "transcriptions" / "target"
	audio_dir.mkdir(parents=True, exist_ok=True)
	prompt_meta_dir.mkdir(parents=True, exist_ok=True)
	target_meta_dir.mkdir(parents=True, exist_ok=True)
	shutil.copy2(prompt_wav_path, audio_dir / "prompt.wav")
	shutil.copy2(prompt_metadata_path, prompt_meta_dir / "metadata.json")
	shutil.copy2(target_metadata_path, target_meta_dir / "metadata.json")
	ok, msg, merged, _, _ = self.run_svs(
	control=control,
	session_base=save_dir,
	auto_shift=auto_shift,
	pitch_shift=pitch_shift,
	)
	if not ok or merged is None:
	return False, msg or "svs failed", None
	return True, "svs inference done", merged


	# Cached AppState, created on first request (needed for ZeroGPU).
	_app_state = None


	def get_app_state():
	    """Return the cached AppState, building it on the first call.

	    On ZeroGPU this function is wrapped with ``spaces.GPU`` after definition.
	    """
	    global _app_state
	    if _app_state is not None:
	        return _app_state
	    _app_state = AppState()
	    return _app_state

	# ZeroGPU: wrap the lazy accessor with spaces.GPU. The request handlers are
	# wrapped the same way further down, once they have been defined.
	if spaces is not None:
	    get_app_state = spaces.GPU()(get_app_state)

	# Without the spaces module the models are loaded eagerly at import time.
	# With it, APP_STATE stays None and the handlers fill it in via get_app_state().
	APP_STATE = AppState() if spaces is None else None


	# i18n: UI string table, keyed by message id and then by language code ("en" / "zh").
	_i18n_key2lang_dict = {
	    "display_lang_label": {"en": "Display Language", "zh": "显示语言"},
	    "seed_label": {"en": "Seed", "zh": "种子"},
	    "prompt_audio_label": {
	        "en": "Prompt audio (reference voice), limit to 30 seconds",
	        "zh": "Prompt 音频（参考音色），限制在 30 秒以内",
	    },
	    "target_audio_label": {
	        "en": "Target audio (melody / lyrics source), limit to 60 seconds",
	        "zh": "Target 音频（旋律/歌词来源），限制在 60 秒以内",
	    },
	    "generate_btn_label": {"en": "Start SVS", "zh": "开始 SVS"},
	    "transcription_btn_label": {"en": "Run singing transcription", "zh": "开始歌声转录"},
	    "synthesis_btn_label": {"en": "Run singing synthesis", "zh": "歌声合成"},
	    "prompt_meta_label": {"en": "Prompt metadata", "zh": "Prompt metadata"},
	    "target_meta_label": {"en": "Target metadata", "zh": "Target metadata"},
	    # Rendered through gr.HTML, so it may contain links.
	    "edit_tutorial_html": {
	        "en": '<p class="mb-0">Refer to <a href="https://github.com/Soul-AILab/SoulX-Singer/tree/main/preprocess#step-2-edit-in-the-midi-editor" target="_blank" rel="noopener">Edit Tutorial</a> for metadata editing (Important Note: The generated metadata may not perfectly align the singing audio with the corresponding lyrics and musical notes. For better results, we strongly recommend manually correcting the alignment. You can directly use <a href="https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor" target="_blank" rel="noopener">SoulX-Singer-Midi-Editor</a> to edit) </p>',
	        "zh": '<p class="mb-0">metadata 编辑请参考 <a href="https://github.com/Soul-AILab/SoulX-Singer/tree/main/preprocess#step-2-edit-in-the-midi-editor" target="_blank" rel="noopener">编辑教程</a> (重要提示：自动生成的 metadata 在音频与歌词、音高对齐效果通常不理想。为了获得更好的结果，我们强烈建议手动纠正对齐，否则会导致合成效果不佳。你可以直接使用 <a href="https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor" target="_blank" rel="noopener">SoulX-Singer-Midi-Editor</a> 进行编辑) </p>',
	    },
	    "prompt_wav_label": {"en": "Prompt WAV (reference)", "zh": "Prompt WAV（参考音色）"},
	    "generated_audio_label": {"en": "Generated merged audio", "zh": "合成结果音频"},
	    "prompt_lyric_lang_label": {"en": "Prompt lyric language", "zh": "Prompt 歌词语种"},
	    "target_lyric_lang_label": {"en": "Target lyric language", "zh": "Target 歌词语种"},
	    "lyric_lang_mandarin": {"en": "Mandarin", "zh": "普通话"},
	    "lyric_lang_cantonese": {"en": "Cantonese", "zh": "粤语"},
	    "lyric_lang_english": {"en": "English", "zh": "英语"},
	    "warn_missing_synthesis": {
	        "en": "Please provide prompt WAV, prompt metadata, and target metadata",
	        "zh": "请提供 Prompt WAV、Prompt metadata 与 Target metadata",
	    },
	    "prompt_vocal_sep_label": {"en": "Prompt vocal separation", "zh": "Prompt人声分离"},
	    "target_vocal_sep_label": {"en": "Target vocal separation", "zh": "Target人声分离"},
	    "auto_shift_label": {"en": "Auto pitch shift", "zh": "自动变调"},
	    "pitch_shift_label": {"en": "Pitch shift (semitones)", "zh": "指定变调（半音）"},
	    "control_type_label": {"en": "Control type", "zh": "控制类型"},
	    "examples_label": {"en": "Reference examples (click to load)", "zh": "参考样例（点击加载）"},
	    # example_choice_0 is the "nothing selected" placeholder entry.
	    "example_choice_0": {"en": "—", "zh": "—"},
	    "example_choice_1": {
	        "en": "Example 1: Mandarin → Mandarin (melody), Start singing synthesis!",
	        "zh": "样例 1: 普通话 → 普通话 (melody), 开始歌声合成吧!",
	    },
	    "example_choice_2": {
	        "en": "Example 2: Mandarin → Cantonese (melody), Start singing synthesis!",
	        "zh": "样例 2: 普通话 → 粤语 (melody), 开始歌声合成吧!",
	    },
	    "example_choice_3": {
	        "en": "Example 3: Mandarin → English (melody), Start singing synthesis!",
	        "zh": "样例 3: 普通话 → 英语 (melody), 开始歌声合成吧!",
	    },
	    "example_choice_4": {
	        "en": "Example 4: Mandarin → Music (score), Start singing synthesis!",
	        "zh": "样例 4: 普通话 → 音乐 (score), 开始歌声合成吧!",
	    },
	    "warn_missing_audio": {
	        "en": "Please upload both prompt audio and target audio",
	        "zh": "请上传 Prompt 音频与 Target 音频",
	    },
	    # Instruction panel (workflow description); p2 is Markdown and contains a link.
	    "instruction_title": {"en": "Usage", "zh": "使用说明"},
	    "instruction_p1": {
	        "en": "After uploading prompt and target audio and clicking Run singing transcription, the system generates two metadata files (prompt and target).",
	        "zh": "上传 Prompt 与 Target 音频并点击「开始歌声转录」后，将生成 Prompt 与 Target 两份 metadata 文件。",
	    },
	    "instruction_p2": {
	        "en": "Auto-transcribed lyrics and notes are often misaligned. For better results, import the generated metadata into the MIDI Editor for manual adjustment: [SoulX-Singer-Midi-Editor](https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor).",
	        "zh": "自动转录的歌词与音高对齐效果通常不理想，建议将生成的 metadata 导入 MIDI 编辑器进行手动调整：[SoulX-Singer-Midi-Editor](https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor)。",
	    },
	    "instruction_p3": {
	        "en": "Re-upload the adjusted metadata to the corresponding Prompt / Target Meta fields, then click Run singing synthesis to generate the final audio.",
	        "zh": "将调整后的 metadata 重新上传至对应的 Prompt / Target Meta 位置后，点击「歌声合成」开始最终生成。",
	    },
	}

	def _detect_initial_lang() -> Literal["zh", "en"]:
	    """Guess the initial UI language from the server's locale.

	    Uses the default locale name, falling back to the ``LANG`` environment
	    variable. Returns ``"en"`` when that name starts with ``en`` and ``"zh"``
	    in every other case, including when detection raises.
	    """
	    try:
	        import locale

	        locale_name = locale.getdefaultlocale()[0]
	        if not locale_name:
	            locale_name = os.environ.get("LANG", "")
	        normalized = (locale_name or "").lower()
	    except Exception:
	        return "zh"
	    if normalized.startswith("en"):
	        return "en"
	    return "zh"


	# Active display language, read by i18n().
	global_lang: Literal["zh", "en"] = _detect_initial_lang()


	def i18n(key: str) -> str:
	    """Return the UI string for ``key`` in the currently active display language.

	    Raises:
	        KeyError: If ``key`` is not present in the string table.
	    """
	    translations = _i18n_key2lang_dict[key]
	    return translations[global_lang]


	def get_lyric_lang_choices():
	    """Build the lyric-language dropdown choices for the active UI language.

	    Returns:
	        A list of ``(display_label, value)`` tuples; the values are always
	        ``"Mandarin"``, ``"Cantonese"`` and ``"English"``.
	    """
	    label_key_to_value = (
	        ("lyric_lang_mandarin", "Mandarin"),
	        ("lyric_lang_cantonese", "Cantonese"),
	        ("lyric_lang_english", "English"),
	    )
	    return [(i18n(label_key), value) for label_key, value in label_key_to_value]


	def _resolve_file_path(x):
	    """Normalise a Gradio file input to an existing file path, or ``None``.

	    The input may be a path string or a ``(path, None)`` tuple. The path is
	    returned only when it is non-empty and names an existing file.
	    """
	    candidate = x[0] if isinstance(x, tuple) else x
	    if not candidate:
	        return None
	    return candidate if os.path.isfile(candidate) else None


	def transcription_function(
	    prompt_audio,
	    target_audio,
	    prompt_metadata,
	    target_metadata,
	    prompt_lyric_lang: str,
	    target_lyric_lang: str,
	    prompt_vocal_sep: bool,
	    target_vocal_sep: bool,
	):
	    """Step 1: produce prompt and target metadata files for the uploaded audio.

	    Both clips are resampled to 44.1 kHz mono, truncated (prompt to 30 s, target
	    to 60 s) and written into the session folder derived from the target file
	    name. If both metadata inputs already resolve to existing files they are
	    copied into the session folder and transcription is skipped; otherwise the
	    preprocessing pipeline is run.

	    Returns:
	        ``(prompt_meta_path, target_meta_path)`` as strings, with ``None`` for a
	        file that does not exist afterwards. ``(None, None)`` is returned when
	        audio is missing, preprocessing fails, or any exception occurs; errors
	        are printed to stderr rather than raised.
	    """
	    global APP_STATE
	    if APP_STATE is None:
	        APP_STATE = get_app_state()
	    try:
	        prompt_src = prompt_audio[0] if isinstance(prompt_audio, tuple) else prompt_audio
	        target_src = target_audio[0] if isinstance(target_audio, tuple) else target_audio
	        if prompt_src is None or target_src is None:
	            gr.Warning(message=i18n("warn_missing_audio"))
	            return None, None

	        supplied_prompt_meta = _resolve_file_path(prompt_metadata)
	        supplied_target_meta = _resolve_file_path(target_metadata)

	        session_base = _session_dir_from_target(target_src)
	        audio_dir = session_base / "audio"
	        audio_dir.mkdir(parents=True, exist_ok=True)
	        session_prompt_wav = audio_dir / "prompt.wav"
	        session_target_wav = audio_dir / "target.wav"

	        sample_rate = 44100
	        prompt_limit_sec = 30
	        target_limit_sec = 60
	        # Load both clips before writing either, so a failed load leaves no partial output.
	        prompt_wave, _ = librosa.load(prompt_src, sr=sample_rate, mono=True)
	        target_wave, _ = librosa.load(target_src, sr=sample_rate, mono=True)
	        sf.write(session_prompt_wav, prompt_wave[: prompt_limit_sec * sample_rate], sample_rate)
	        sf.write(session_target_wav, target_wave[: target_limit_sec * sample_rate], sample_rate)

	        prompt_meta_dir = session_base / "transcriptions" / "prompt"
	        target_meta_dir = session_base / "transcriptions" / "target"
	        prompt_meta_path = prompt_meta_dir / "metadata.json"
	        target_meta_path = target_meta_dir / "metadata.json"

	        if supplied_prompt_meta is not None and supplied_target_meta is not None:
	            # Both metadata files were provided: reuse them instead of transcribing.
	            prompt_meta_dir.mkdir(parents=True, exist_ok=True)
	            target_meta_dir.mkdir(parents=True, exist_ok=True)
	            shutil.copy2(supplied_prompt_meta, prompt_meta_path)
	            shutil.copy2(supplied_target_meta, target_meta_path)
	        else:
	            ok, msg = APP_STATE.run_preprocess(
	                session_prompt_wav,
	                session_target_wav,
	                session_base,
	                prompt_vocal_sep=prompt_vocal_sep,
	                target_vocal_sep=target_vocal_sep,
	                prompt_lyric_lang=prompt_lyric_lang or "Mandarin",
	                target_lyric_lang=target_lyric_lang or "Mandarin",
	            )
	            if not ok:
	                print(msg, file=sys.stderr, flush=True)
	                return None, None

	        return (
	            str(prompt_meta_path) if prompt_meta_path.exists() else None,
	            str(target_meta_path) if target_meta_path.exists() else None,
	        )
	    except Exception:
	        print(traceback.format_exc(), file=sys.stderr, flush=True)
	        return None, None


	def synthesis_function(
	    prompt_audio,
	    prompt_metadata,
	    target_metadata,
	    control: str,
	    auto_shift: bool,
	    pitch_shift,
	    seed: int,
	):
	    """Step 2: synthesise singing from the prompt audio and both metadata files.

	    Shows a warning and returns ``None`` unless the prompt audio, prompt
	    metadata and target metadata all resolve to existing files. An unknown
	    ``control`` value becomes ``"score"``. torch, numpy and ``random`` are
	    seeded with ``seed`` before inference.

	    Returns:
	        The generated audio path as a string, or ``None`` on any failure;
	        errors are printed to stderr rather than raised.
	    """
	    global APP_STATE
	    if APP_STATE is None:
	        APP_STATE = get_app_state()
	    try:
	        prompt_wav_path = prompt_audio[0] if isinstance(prompt_audio, tuple) else prompt_audio
	        prompt_meta_path = _resolve_file_path(prompt_metadata)
	        target_meta_path = _resolve_file_path(target_metadata)

	        # Checked in this order, stopping at the first missing input.
	        required_inputs = (prompt_wav_path, prompt_meta_path, target_meta_path)
	        if not all(path and os.path.isfile(path) for path in required_inputs):
	            gr.Warning(message=i18n("warn_missing_synthesis"))
	            return None

	        if control not in ("melody", "score"):
	            control = "score"

	        seed = int(seed)
	        for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
	            seed_rng(seed)

	        ok, msg, merged = APP_STATE.run_svs_from_paths(
	            prompt_wav_path=prompt_wav_path,
	            prompt_metadata_path=prompt_meta_path,
	            target_metadata_path=target_meta_path,
	            control=control,
	            auto_shift=auto_shift,
	            pitch_shift=int(pitch_shift),
	        )
	        if ok and merged is not None:
	            return str(merged)
	        print(msg or "synthesis failed", file=sys.stderr, flush=True)
	        return None
	    except Exception:
	        print(traceback.format_exc(), file=sys.stderr, flush=True)
	        return None


	# ZeroGPU: wrap the two GPU-bound handlers with spaces.GPU now that both exist.
	if spaces is not None:
	    transcription_function, synthesis_function = (
	        spaces.GPU()(transcription_function),
	        spaces.GPU()(synthesis_function),
	    )


	def _instruction_md() -> str:
	    """Return the usage steps as a numbered Markdown list in the active language."""
	    step_keys = ("instruction_p1", "instruction_p2", "instruction_p3")
	    return "\n\n".join(
	        f"{number}. {i18n(key)}" for number, key in enumerate(step_keys, start=1)
	    )


	def render_interface() -> gr.Blocks:
	    """Build the Gradio page: language switch, examples, transcription and synthesis.

	    Layout, top to bottom: title banner, language radio, usage instructions,
	    example dropdown, prompt/target audio inputs, transcription options and
	    button, metadata tutorial link, synthesis parameters and button, output audio.
	    Event handlers are registered after all components exist.

	    Returns:
	        The assembled ``gr.Blocks`` page (not yet queued or launched).
	    """

	    def _example_labels():
	        """Example dropdown labels in the active language; entry 0 is the placeholder."""
	        return [i18n(f"example_choice_{number}") for number in range(5)]

	    # The labels below are rendered in ``global_lang``, so the radio must start on
	    # the matching option. Re-selecting the checked option fires no change event,
	    # which would otherwise make the displayed language unreachable in one click.
	    initial_lang_label = "English" if global_lang == "en" else "中文"

	    with gr.Blocks(title="SoulX-Singer 歌声合成Demo", theme=gr.themes.Default()) as page:
	        gr.HTML(
	            '<div style="'
	            'text-align: center; '
	            'padding: 1.25rem 0 1.5rem; '
	            'margin-bottom: 0.5rem;'
	            '">'
	            '<div style="'
	            'display: inline-block; '
	            'font-size: 1.75rem; '
	            'font-weight: 700; '
	            'letter-spacing: 0.02em; '
	            'color: #1a1a2e; '
	            'line-height: 1.3;'
	            '">SoulX-Singer</div>'
	            '<div style="'
	            'width: 80px; '
	            'height: 3px; '
	            'margin: 1rem auto 0; '
	            'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
	            'border-radius: 2px;'
	            '"></div>'
	            '</div>'
	        )
	        # Auto-detect browser language after the page loads: an English browser
	        # clicks the second radio option.
	        # NOTE(review): Gradio may not execute <script> tags injected through
	        # gr.HTML - confirm this runs on the deployed Gradio version.
	        gr.HTML(
	            '<script type="text/javascript">'
	            '(function(){'
	            'function setLang(){'
	            'var lang=(navigator.language||navigator.userLanguage||"").toLowerCase();'
	            'if(lang.startsWith("en")){'
	            'var inputs=document.querySelectorAll("#lang_choice_radio input");'
	            'if(inputs.length>1)inputs[1].click();'
	            '}'
	            '}'
	            'if(document.readyState==="complete")setTimeout(setLang,800);'
	            'else window.addEventListener("load",function(){setTimeout(setLang,800);});'
	            '})();'
	            '</script>',
	            visible=False,
	        )
	        with gr.Row(equal_height=True):
	            lang_choice = gr.Radio(
	                choices=["中文", "English"],
	                value=initial_lang_label,
	                label=i18n("display_lang_label"),
	                type="index",
	                interactive=True,
	                elem_id="lang_choice_radio",
	            )

	        # Instruction panel (usage workflow); updated on language change.
	        instruction_md = gr.Markdown(f"### {i18n('instruction_title')}\n\n{_instruction_md()}")

	        # Reference examples. The counter tells the audio.change handlers how many
	        # upcoming events were caused by loading an example and must not clear metadata.
	        skip_clear_metadata_count = gr.State(0)
	        with gr.Row():
	            initial_example_labels = _example_labels()
	            example_choice = gr.Dropdown(
	                label=i18n("examples_label"),
	                choices=initial_example_labels,
	                value=initial_example_labels[0],
	                interactive=True,
	            )

	        # Step 1: transcription (audio -> metadata).
	        with gr.Row(equal_height=True):
	            with gr.Column(scale=1):
	                prompt_audio = gr.Audio(
	                    label=i18n("prompt_audio_label"),
	                    type="filepath",
	                    editable=False,
	                    interactive=True,
	                )
	            with gr.Column(scale=1):
	                target_audio = gr.Audio(
	                    label=i18n("target_audio_label"),
	                    type="filepath",
	                    editable=False,
	                    interactive=True,
	                )
	        with gr.Row(equal_height=True):
	            prompt_lyric_lang = gr.Dropdown(
	                label=i18n("prompt_lyric_lang_label"),
	                choices=get_lyric_lang_choices(),
	                value="Mandarin",
	                interactive=True,
	                scale=1,
	            )
	            target_lyric_lang = gr.Dropdown(
	                label=i18n("target_lyric_lang_label"),
	                choices=get_lyric_lang_choices(),
	                value="Mandarin",
	                interactive=True,
	                scale=1,
	            )
	            prompt_vocal_sep = gr.Checkbox(
	                label=i18n("prompt_vocal_sep_label"),
	                value=False,
	                interactive=True,
	                scale=1,
	            )
	            target_vocal_sep = gr.Checkbox(
	                label=i18n("target_vocal_sep_label"),
	                value=True,
	                interactive=True,
	                scale=1,
	            )
	        with gr.Row():
	            transcription_btn = gr.Button(
	                value=i18n("transcription_btn_label"),
	                variant="primary",
	                size="lg",
	            )

	        # Edit tutorial link (gr.HTML supports links; component labels do not).
	        metadata_tutorial_html = gr.HTML(value=i18n("edit_tutorial_html"))

	        # Step 2: synthesis - parameters row, then the button on the next row.
	        with gr.Row(equal_height=True):
	            prompt_metadata = gr.File(
	                label=i18n("prompt_meta_label"),
	                type="filepath",
	                file_types=[".json"],
	                interactive=True,
	            )
	            target_metadata = gr.File(
	                label=i18n("target_meta_label"),
	                type="filepath",
	                file_types=[".json"],
	                interactive=True,
	            )
	            control_radio = gr.Radio(
	                choices=["melody", "score"],
	                value="score",
	                label=i18n("control_type_label"),
	                scale=1,
	            )
	            auto_shift = gr.Checkbox(
	                label=i18n("auto_shift_label"),
	                value=True,
	                interactive=True,
	                scale=1,
	            )
	            pitch_shift = gr.Number(
	                label=i18n("pitch_shift_label"),
	                value=0,
	                minimum=-36,
	                maximum=36,
	                step=1,
	                interactive=True,
	                scale=1,
	            )
	            seed_input = gr.Number(
	                label=i18n("seed_label"),
	                value=12306,
	                step=1,
	                interactive=True,
	                scale=1,
	            )
	        with gr.Row():
	            synthesis_btn = gr.Button(
	                value=i18n("synthesis_btn_label"),
	                variant="primary",
	                size="lg",
	            )
	        with gr.Row():
	            output_audio = gr.Audio(
	                label=i18n("generated_audio_label"),
	                type="filepath",
	                interactive=False,
	            )

	        # Output order must match the list returned by _load_example.
	        example_choice.change(
	            fn=_load_example,
	            inputs=[example_choice],
	            outputs=[
	                prompt_audio,
	                target_audio,
	                prompt_metadata,
	                target_metadata,
	                prompt_lyric_lang,
	                target_lyric_lang,
	                control_radio,
	                prompt_vocal_sep,
	                target_vocal_sep,
	                auto_shift,
	                pitch_shift,
	                skip_clear_metadata_count,
	            ],
	        )

	        def _change_component_language(lang):
	            """Switch the active language and return relabelling updates.

	            ``lang`` is the radio index (0 = zh, 1 = en). The returned list is
	            positional and must stay aligned with the ``outputs`` of
	            ``lang_choice.change`` below.
	            """
	            global global_lang
	            global_lang = ["zh", "en"][lang]
	            choices = get_lyric_lang_choices()
	            return [
	                gr.update(label=i18n("prompt_audio_label")),
	                gr.update(label=i18n("target_audio_label")),
	                gr.update(label=i18n("prompt_lyric_lang_label"), choices=choices),
	                gr.update(label=i18n("target_lyric_lang_label"), choices=choices),
	                gr.update(label=i18n("prompt_vocal_sep_label")),
	                gr.update(label=i18n("target_vocal_sep_label")),
	                gr.update(value=i18n("transcription_btn_label")),
	                gr.update(label=i18n("prompt_meta_label")),
	                gr.update(label=i18n("target_meta_label")),
	                gr.update(value=i18n("edit_tutorial_html")),
	                gr.update(label=i18n("control_type_label")),
	                gr.update(label=i18n("auto_shift_label")),
	                gr.update(label=i18n("pitch_shift_label")),
	                gr.update(label=i18n("seed_label")),
	                gr.update(value=i18n("synthesis_btn_label")),
	                gr.update(label=i18n("generated_audio_label")),
	                gr.update(label=i18n("display_lang_label")),
	                gr.update(
	                    label=i18n("examples_label"),
	                    choices=_example_labels(),
	                    value=i18n("example_choice_0"),
	                ),
	                gr.update(value=f"### {i18n('instruction_title')}\n\n{_instruction_md()}"),
	            ]

	        lang_choice.change(
	            fn=_change_component_language,
	            inputs=[lang_choice],
	            outputs=[
	                prompt_audio,
	                target_audio,
	                prompt_lyric_lang,
	                target_lyric_lang,
	                prompt_vocal_sep,
	                target_vocal_sep,
	                transcription_btn,
	                prompt_metadata,
	                target_metadata,
	                metadata_tutorial_html,
	                control_radio,
	                auto_shift,
	                pitch_shift,
	                seed_input,
	                synthesis_btn,
	                output_audio,
	                lang_choice,
	                example_choice,
	                instruction_md,
	            ],
	        )

	        # New prompt/target audio clears the corresponding metadata, unless the
	        # change was caused by loading an example.
	        prompt_audio.change(
	            fn=_clear_prompt_meta_unless_example,
	            inputs=[prompt_audio, skip_clear_metadata_count],
	            outputs=[prompt_metadata, skip_clear_metadata_count],
	        )
	        target_audio.change(
	            fn=_clear_target_meta_unless_example,
	            inputs=[target_audio, skip_clear_metadata_count],
	            outputs=[target_metadata, skip_clear_metadata_count],
	        )

	        transcription_btn.click(
	            fn=transcription_function,
	            inputs=[
	                prompt_audio,
	                target_audio,
	                prompt_metadata,
	                target_metadata,
	                prompt_lyric_lang,
	                target_lyric_lang,
	                prompt_vocal_sep,
	                target_vocal_sep,
	            ],
	            outputs=[prompt_metadata, target_metadata],
	        )

	        synthesis_btn.click(
	            fn=synthesis_function,
	            inputs=[
	                prompt_audio,
	                prompt_metadata,
	                target_metadata,
	                control_radio,
	                auto_shift,
	                pitch_shift,
	                seed_input,
	            ],
	            outputs=[output_audio],
	        )

	    return page


	if __name__ == "__main__":
	    import argparse

	    # Command-line options: server port and optional public share link.
	    cli = argparse.ArgumentParser()
	    cli.add_argument("--port", type=int, default=7860, help="Gradio server port")
	    cli.add_argument("--share", action="store_true", help="Create public link")
	    options = cli.parse_args()

	    # Build the UI, enable the request queue, then serve on all interfaces.
	    page = render_interface()
	    page.queue()
	    launch_options = {
	        "share": options.share,
	        "server_name": "0.0.0.0",
	        "server_port": options.port,
	    }
	    page.launch(**launch_options)