multimodalart HF Staff commited on
Commit
1e61d28
·
verified ·
1 Parent(s): 4a3004b

Update webui.py

Browse files
Files changed (1) hide show
  1. webui.py +251 -516
webui.py CHANGED
@@ -5,7 +5,7 @@ import shutil
5
  import sys
6
  import traceback
7
  from pathlib import Path
8
- from typing import Literal, Tuple
9
  import spaces
10
 
11
  import numpy as np
@@ -21,131 +21,8 @@ from cli.inference import build_model as build_svs_model, process as svs_process
21
 
22
  ROOT = Path(__file__).parent
23
 
24
- ENGLISH_EXAMPLE_PROMPT_AUDIO = "example/audio/en_prompt.mp3"
25
- ENGLISH_EXAMPLE_PROMPT_META = "example/audio/en_prompt.json"
26
- ENGLISH_EXAMPLE_TARGET_AUDIO = "example/audio/en_target.mp3"
27
- ENGLISH_EXAMPLE_TARGET_META = "example/audio/en_target.json"
28
-
29
- MANDARIN_EXAMPLE_PROMPT_AUDIO = "example/audio/zh_prompt.mp3"
30
- MANDARIN_EXAMPLE_PROMPT_META = "example/audio/zh_prompt.json"
31
- MANDARIN_EXAMPLE_TARGET_AUDIO = "example/audio/zh_target.mp3"
32
- MANDARIN_EXAMPLE_TARGET_META = "example/audio/zh_target.json"
33
-
34
- CANTONESE_EXAMPLE_PROMPT_AUDIO = "example/audio/yue_prompt.mp3"
35
- CANTONESE_EXAMPLE_PROMPT_META = "example/audio/yue_prompt.json"
36
- CANTONESE_EXAMPLE_TARGET_AUDIO = "example/audio/yue_target.mp3"
37
- CANTONESE_EXAMPLE_TARGET_META = "example/audio/yue_target.json"
38
-
39
- MUSIC_EXAMPLE_TARGET_AUDIO = "example/audio/music.mp3"
40
- MUSIC_EXAMPLE_TARGET_META = "example/audio/music.json"
41
-
42
- # Lyric language: value (Mandarin/Cantonese/English) is passed to PreprocessPipeline; display labels from i18n via get_lyric_lang_choices()
43
-
44
- # Use absolute paths so Examples load correctly (including File components for metadata)
45
- EXAMPLES_LIST = [
46
- [
47
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
48
- str(ROOT / MANDARIN_EXAMPLE_TARGET_AUDIO),
49
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
50
- str(ROOT / MANDARIN_EXAMPLE_TARGET_META),
51
- "Mandarin",
52
- "Mandarin",
53
- "melody",
54
- False,
55
- True,
56
- True,
57
- 0,
58
- ],
59
- [
60
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
61
- str(ROOT / CANTONESE_EXAMPLE_TARGET_AUDIO),
62
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
63
- str(ROOT / CANTONESE_EXAMPLE_TARGET_META),
64
- "Mandarin",
65
- "Cantonese",
66
- "melody",
67
- False,
68
- True,
69
- True,
70
- 0,
71
- ],
72
- [
73
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
74
- str(ROOT / ENGLISH_EXAMPLE_TARGET_AUDIO),
75
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
76
- str(ROOT / ENGLISH_EXAMPLE_TARGET_META),
77
- "Mandarin",
78
- "English",
79
- "melody",
80
- False,
81
- True,
82
- True,
83
- 0,
84
- ],
85
- [
86
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_AUDIO),
87
- str(ROOT / MUSIC_EXAMPLE_TARGET_AUDIO),
88
- str(ROOT / MANDARIN_EXAMPLE_PROMPT_META),
89
- str(ROOT / MUSIC_EXAMPLE_TARGET_META),
90
- "Mandarin",
91
- "Mandarin",
92
- "melody",
93
- False,
94
- True,
95
- True,
96
- 0,
97
- ],
98
- ]
99
-
100
-
101
- def _load_example(choice_value):
102
- """Return 11 example values + skip_clear_count (2 when loading example so next 2 audio.change events don't clear metadata).
103
- choice_value: selected dropdown string (or index in older flow); map to example index 0/1/2."""
104
- if choice_value is None:
105
- return [gr.update()] * 11 + [0]
106
- idx = 0
107
- if isinstance(choice_value, int):
108
- idx = 0 if choice_value <= 0 else min(choice_value - 1, len(EXAMPLES_LIST) - 1)
109
- else:
110
- if choice_value == i18n("example_choice_1"):
111
- idx = 1
112
- elif choice_value == i18n("example_choice_2"):
113
- idx = 2
114
- elif choice_value == i18n("example_choice_3"):
115
- idx = 3
116
- elif choice_value == i18n("example_choice_4"):
117
- idx = 4
118
- if idx <= 0:
119
- return [gr.update()] * 11 + [0]
120
- list_idx = idx - 1
121
- if list_idx >= len(EXAMPLES_LIST):
122
- return [gr.update()] * 11 + [0]
123
- row = EXAMPLES_LIST[list_idx]
124
- return [
125
- row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9], row[10],
126
- 2, # skip_clear_metadata_count: next 2 audio.change events (prompt + target) will not clear metadata
127
- ]
128
-
129
-
130
- def _clear_prompt_meta_unless_example(_audio, skip_count):
131
- if skip_count and skip_count > 0:
132
- return gr.skip(), max(0, skip_count - 1)
133
- return None, 0
134
-
135
-
136
- def _clear_target_meta_unless_example(_audio, skip_count):
137
- if skip_count and skip_count > 0:
138
- return gr.skip(), max(0, skip_count - 1)
139
- return None, 0
140
-
141
 
142
  def _get_device() -> str:
143
- """Use CUDA if available, else CPU (e.g. for CI or CPU-only environments).
144
-
145
- On ZeroGPU Spaces the real GPU is only attached inside @spaces.GPU, so
146
- torch.cuda.is_available() returns False at module-load time. We still
147
- want models to target CUDA so they run on the GPU once it is attached.
148
- """
149
  if torch.cuda.is_available():
150
  return "cuda:0"
151
  try:
@@ -169,7 +46,7 @@ class AppState:
169
  self.device = _get_device()
170
  self.preprocess_pipeline = PreprocessPipeline(
171
  device=self.device,
172
- language="Mandarin",
173
  save_dir=str(ROOT / "outputs" / "gradio" / "_placeholder" / "transcriptions"),
174
  vocal_sep=True,
175
  max_merge_duration=60000,
@@ -199,14 +76,14 @@ class AppState:
199
  audio_path=str(prompt_path),
200
  vocal_sep=prompt_vocal_sep,
201
  max_merge_duration=20000,
202
- language=prompt_lyric_lang or "Mandarin",
203
  )
204
  self.preprocess_pipeline.save_dir = str(session_base / "transcriptions" / "target")
205
  self.preprocess_pipeline.run(
206
  audio_path=str(target_path),
207
  vocal_sep=target_vocal_sep,
208
  max_merge_duration=60000,
209
- language=target_lyric_lang or "Mandarin",
210
  )
211
  return True, "preprocess done"
212
  except Exception as e:
@@ -223,8 +100,10 @@ class AppState:
223
  control = "score"
224
  save_dir = session_base / "generated"
225
  save_dir.mkdir(parents=True, exist_ok=True)
 
226
  class Args:
227
  pass
 
228
  args = Args()
229
  args.device = self.device
230
  args.model_path = "pretrained_models/SoulX-Singer/model.pt"
@@ -258,7 +137,6 @@ class AppState:
258
  pitch_shift: int,
259
  save_dir: Path | None = None,
260
  ) -> Tuple[bool, str, Path | None]:
261
- """Run SVS from explicit prompt wav and metadata paths."""
262
  if save_dir is None:
263
  import uuid
264
  save_dir = ROOT / "outputs" / "gradio" / "synthesis" / str(uuid.uuid4())[:8]
@@ -289,105 +167,65 @@ ensure_pretrained_models()
289
  APP_STATE = AppState()
290
 
291
 
292
- # i18n
293
- _i18n_key2lang_dict = dict(
294
- display_lang_label=dict(en="Display Language", zh="显示语言"),
295
- seed_label=dict(en="Seed", zh="种子"),
296
- prompt_audio_label=dict(en="Prompt audio (reference voice), limit to 30 seconds", zh="Prompt 音频(参考音色),限制在 30 秒以内"),
297
- target_audio_label=dict(en="Target audio (melody / lyrics source), limit to 60 seconds", zh="Target 音频(旋律/歌词来源),限制在 60 秒以内"),
298
- generate_btn_label=dict(en="Start SVS", zh="开始 SVS"),
299
- transcription_btn_label=dict(en="Run singing transcription", zh="开始歌声转录"),
300
- synthesis_btn_label=dict(en="Run singing synthesis", zh="歌声合成"),
301
- prompt_meta_label=dict(en="Prompt metadata", zh="Prompt metadata"),
302
- target_meta_label=dict(en="Target metadata", zh="Target metadata"),
303
-
304
- edit_tutorial_html=dict(
305
- en='<p class="mb-0">Refer to <a href="https://github.com/Soul-AILab/SoulX-Singer/tree/main/preprocess#step-2-edit-in-the-midi-editor" target="_blank" rel="noopener">Edit Tutorial</a> for metadata editing (Important Note: The generated metadata may not perfectly align the singing audio with the corresponding lyrics and musical notes. For better results, we strongly recommend manually correcting the alignment. You can directly use <a href="https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor" target="_blank" rel="noopener">SoulX-Singer-Midi-Editor</a> to edit) </p>',
306
- zh='<p class="mb-0">metadata 编辑请参考 <a href="https://github.com/Soul-AILab/SoulX-Singer/tree/main/preprocess#step-2-edit-in-the-midi-editor" target="_blank" rel="noopener">编辑教程</a> (重要提示:自动生成的 metadata 在音频与歌词、音高对齐效果通常不理想。为了获得更好的结果,我们强烈建议手动纠正对齐,否则会导致合成效果不佳。 你可以直接使用 <a href="https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor" target="_blank" rel="noopener">SoulX-Singer-Midi-Editor</a> 进行编辑) </p>',
307
- ),
308
- prompt_wav_label=dict(en="Prompt WAV (reference)", zh="Prompt WAV(参考音色)"),
309
- generated_audio_label=dict(en="Generated merged audio", zh="合成结果音频"),
310
- prompt_lyric_lang_label=dict(en="Prompt lyric language", zh="Prompt 歌词语种"),
311
- target_lyric_lang_label=dict(en="Target lyric language", zh="Target 歌词语种"),
312
- lyric_lang_mandarin=dict(en="Mandarin", zh="普通话"),
313
- lyric_lang_cantonese=dict(en="Cantonese", zh="粤语"),
314
- lyric_lang_english=dict(en="English", zh="英语"),
315
- warn_missing_synthesis=dict(en="Please provide prompt WAV, prompt metadata, and target metadata", zh="请提供 Prompt WAV、Prompt metadata 与 Target metadata"),
316
- prompt_vocal_sep_label=dict(en="Prompt vocal separation", zh="Prompt人声分离"),
317
- target_vocal_sep_label=dict(en="Target vocal separation", zh="Target人声分离"),
318
- auto_shift_label=dict(en="Auto pitch shift", zh="自动变调"),
319
- pitch_shift_label=dict(en="Pitch shift (semitones)", zh="指定变调(半音)"),
320
- control_type_label=dict(en="Control type", zh="控制类型"),
321
- examples_label=dict(en="Reference examples (click to load)", zh="参考样例(点击加载)"),
322
- example_choice_0=dict(en="—", zh="—"),
323
- example_choice_1=dict(en="Example 1: Mandarin → Mandarin (melody), Start singing synthesis!", zh="样例 1: 普通话 → 普通话 (melody), 开���歌声合成吧!"),
324
- example_choice_2=dict(en="Example 2: Mandarin → Cantonese (melody), Start singing synthesis!", zh="样例 2: 普通话 → 粤语 (melody), 开始歌声合成吧!"),
325
- example_choice_3=dict(en="Example 3: Mandarin → English (melody), Start singing synthesis!", zh="样例 3: 普通话 → 英语 (melody), 开始歌声合成吧!"),
326
- example_choice_4=dict(en="Example 4: Mandarin → Music (score), Start singing synthesis!", zh="样例 4: 普通话 → 音乐 (score), 开始歌声合成吧!"),
327
- warn_missing_audio=dict(
328
- en="Please upload both prompt audio and target audio",
329
- zh="请上传 Prompt 音频与 Target 音频",
330
- ),
331
- # Instruction panel (workflow description)
332
- instruction_title=dict(en="Usage", zh="使用说明"),
333
- instruction_p1=dict(
334
- en="After uploading prompt and target audio and clicking **Run singing transcription**, the system generates two metadata files (prompt and target).",
335
- zh="上传 Prompt 与 Target 音频并点击「开始歌声转录」后,将生成 Prompt 与 Target 两份 metadata 文件。",
336
- ),
337
- instruction_p2=dict(
338
- en="Auto-transcribed lyrics and notes are often misaligned. For better results, import the generated metadata into the **MIDI Editor** for manual adjustment: [SoulX-Singer-Midi-Editor](https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor).",
339
- zh="自动转录的歌词与音高对齐效果通常不理想,建议将生成的 metadata 导入 **MIDI 编辑器** 进行手动调整:[SoulX-Singer-Midi-Editor](https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor)。",
340
- ),
341
- instruction_p3=dict(
342
- en="Re-upload the adjusted metadata to the corresponding Prompt / Target Meta fields, then click **Run singing synthesis** to generate the final audio.",
343
- zh="将调整后的 metadata 重新上传至对应的 Prompt / Target Meta 位置后,点击「歌声合成」开始最终生成。",
344
- ),
345
- )
346
-
347
- def _detect_initial_lang() -> Literal["zh", "en"]:
348
- """Detect initial UI language from server locale (browser language applied later via JS)."""
349
- try:
350
- import locale
351
- loc = (locale.getdefaultlocale()[0] or os.environ.get("LANG", "") or "").lower()
352
- return "en" if loc.startswith("en") else "zh"
353
- except Exception:
354
- return "zh"
355
-
356
-
357
- global_lang: Literal["zh", "en"] = _detect_initial_lang()
358
-
359
-
360
- def i18n(key: str) -> str:
361
- return _i18n_key2lang_dict[key][global_lang]
362
-
363
-
364
- def get_lyric_lang_choices():
365
- """Lyric language dropdown (display, value) for current UI language."""
366
- return [
367
- (i18n("lyric_lang_mandarin"), "Mandarin"),
368
- (i18n("lyric_lang_cantonese"), "Cantonese"),
369
- (i18n("lyric_lang_english"), "English"),
370
- ]
371
-
372
-
373
  def _resolve_file_path(x):
374
- """Gradio file input can be path string or (path, None) tuple."""
375
  if x is None:
376
  return None
377
  if isinstance(x, tuple):
378
  x = x[0]
379
  return x if (x and os.path.isfile(x)) else None
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  @spaces.GPU
382
  def transcription_function(
383
- prompt_audio,
384
- target_audio,
385
- prompt_metadata,
386
- target_metadata,
387
- prompt_lyric_lang: str,
388
- target_lyric_lang: str,
389
- prompt_vocal_sep: bool,
390
- target_vocal_sep: bool,
391
  ):
392
  """Step 1: Run transcription only; output (prompt_meta_path, target_meta_path)."""
393
  try:
@@ -396,89 +234,108 @@ def transcription_function(
396
  if isinstance(target_audio, tuple):
397
  target_audio = target_audio[0]
398
  if prompt_audio is None or target_audio is None:
399
- gr.Warning(message=i18n("warn_missing_audio"))
400
  return None, None
 
401
  prompt_meta_resolved = _resolve_file_path(prompt_metadata)
402
  target_meta_resolved = _resolve_file_path(target_metadata)
403
  use_input_metadata = prompt_meta_resolved is not None and target_meta_resolved is not None
404
 
405
- session_base = _session_dir_from_target(target_audio)
406
- audio_dir = session_base / "audio"
407
- audio_dir.mkdir(parents=True, exist_ok=True)
408
- transfer_prompt_path = audio_dir / "prompt.wav"
409
- transfer_target_path = audio_dir / "target.wav"
410
- SR = 44100
411
- PROMPT_MAX_SEC = 30
412
- TARGET_MAX_SEC = 60
413
- prompt_audio_data, _ = librosa.load(prompt_audio, sr=SR, mono=True)
414
- target_audio_data, _ = librosa.load(target_audio, sr=SR, mono=True)
415
- prompt_audio_data = prompt_audio_data[: PROMPT_MAX_SEC * SR]
416
- target_audio_data = target_audio_data[: TARGET_MAX_SEC * SR]
417
- sf.write(transfer_prompt_path, prompt_audio_data, SR)
418
- sf.write(transfer_target_path, target_audio_data, SR)
419
-
420
- prompt_meta_path = session_base / "transcriptions" / "prompt" / "metadata.json"
421
- target_meta_path = session_base / "transcriptions" / "target" / "metadata.json"
422
  if use_input_metadata:
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  (session_base / "transcriptions" / "prompt").mkdir(parents=True, exist_ok=True)
424
  (session_base / "transcriptions" / "target").mkdir(parents=True, exist_ok=True)
425
  shutil.copy2(prompt_meta_resolved, prompt_meta_path)
426
  shutil.copy2(target_meta_resolved, target_meta_path)
 
427
  else:
428
- ok, msg = APP_STATE.run_preprocess(
429
- transfer_prompt_path,
430
- transfer_target_path,
431
- session_base,
432
- prompt_vocal_sep=prompt_vocal_sep,
433
- target_vocal_sep=target_vocal_sep,
434
- prompt_lyric_lang=prompt_lyric_lang or "Mandarin",
435
- target_lyric_lang=target_lyric_lang or "Mandarin",
436
  )
437
- if not ok:
438
- print(msg, file=sys.stderr, flush=True)
439
- return None, None
440
-
441
- prompt_meta_file = str(prompt_meta_path) if prompt_meta_path.exists() else None
442
- target_meta_file = str(target_meta_path) if target_meta_path.exists() else None
443
- return prompt_meta_file, target_meta_file
444
  except Exception:
445
  print(traceback.format_exc(), file=sys.stderr, flush=True)
446
  return None, None
447
 
 
448
  @spaces.GPU
449
  def synthesis_function(
450
  prompt_audio,
 
451
  prompt_metadata,
452
  target_metadata,
453
- control: str,
454
- auto_shift: bool,
455
  pitch_shift,
456
- seed: int,
 
 
 
 
457
  ):
458
- """Step 2: Run SVS from top prompt_audio + prompt_metadata + target_metadata."""
459
  try:
460
  if isinstance(prompt_audio, tuple):
461
  prompt_audio = prompt_audio[0]
462
- prompt_wav_path = prompt_audio
 
 
 
 
 
 
 
 
 
463
  prompt_meta_path = _resolve_file_path(prompt_metadata)
464
  target_meta_path = _resolve_file_path(target_metadata)
465
- if not prompt_wav_path or not os.path.isfile(prompt_wav_path):
466
- gr.Warning(message=i18n("warn_missing_synthesis"))
467
- return None
468
- if not prompt_meta_path or not os.path.isfile(prompt_meta_path):
469
- gr.Warning(message=i18n("warn_missing_synthesis"))
470
- return None
471
- if not target_meta_path or not os.path.isfile(target_meta_path):
472
- gr.Warning(message=i18n("warn_missing_synthesis"))
473
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  if control not in ("melody", "score"):
475
  control = "score"
476
  seed = int(seed)
477
  torch.manual_seed(seed)
478
  np.random.seed(seed)
479
  random.seed(seed)
 
480
  ok, msg, merged = APP_STATE.run_svs_from_paths(
481
- prompt_wav_path=prompt_wav_path,
482
  prompt_metadata_path=prompt_meta_path,
483
  target_metadata_path=target_meta_path,
484
  control=control,
@@ -487,24 +344,17 @@ def synthesis_function(
487
  )
488
  if not ok or merged is None:
489
  print(msg or "synthesis failed", file=sys.stderr, flush=True)
490
- return None
491
- return str(merged)
 
 
492
  except Exception:
493
  print(traceback.format_exc(), file=sys.stderr, flush=True)
494
- return None
495
-
496
-
497
- def _instruction_md() -> str:
498
- """Markdown content for the instruction panel (supports links)."""
499
- return "\n\n".join([
500
- f"**1.** {i18n('instruction_p1')}",
501
- f"**2.** {i18n('instruction_p2')}",
502
- f"**3.** {i18n('instruction_p3')}",
503
- ])
504
 
505
 
506
  def render_interface() -> gr.Blocks:
507
- with gr.Blocks(title="SoulX-Singer 歌声合成Demo", theme=gr.themes.Default()) as page:
508
  gr.HTML(
509
  '<div style="'
510
  'text-align: center; '
@@ -528,252 +378,139 @@ def render_interface() -> gr.Blocks:
528
  '"></div>'
529
  '</div>'
530
  )
531
- # Auto-detect browser language: run after Gradio mounts
532
- gr.HTML(
533
- '<script type="text/javascript">'
534
- '(function(){'
535
- 'function setLang(){'
536
- 'var lang=(navigator.language||navigator.userLanguage||"").toLowerCase();'
537
- 'if(lang.startsWith("en")){'
538
- 'var inputs=document.querySelectorAll("#lang_choice_radio input");'
539
- 'if(inputs.length>1)inputs[1].click();'
540
- '}'
541
- '}'
542
- 'if(document.readyState==="complete")setTimeout(setLang,800);'
543
- 'else window.addEventListener("load",function(){setTimeout(setLang,800);});'
544
- '})();'
545
- '</script>',
546
- visible=False,
547
- )
548
- with gr.Row(equal_height=True):
549
- lang_choice = gr.Radio(
550
- choices=["中文", "English"],
551
- value="中文",
552
- label=i18n("display_lang_label"),
553
- type="index",
554
- interactive=True,
555
- elem_id="lang_choice_radio",
556
- )
557
-
558
- # Instruction panel (usage workflow); updates on language change
559
- instruction_md = gr.Markdown(f"### {i18n('instruction_title')}\n\n{_instruction_md()}")
560
-
561
- # Reference examples — at the front of operations (handler registered after components exist)
562
- skip_clear_metadata_count = gr.State(0)
563
- with gr.Row():
564
- _example_choices = [i18n("example_choice_0"), i18n("example_choice_1"), i18n("example_choice_2"), i18n("example_choice_3"), i18n("example_choice_4")]
565
- example_choice = gr.Dropdown(
566
- label=i18n("examples_label"),
567
- choices=_example_choices,
568
- value=_example_choices[0],
569
- interactive=True,
570
- )
571
 
572
- # Step 1: Transcription (audio → metadata)
573
- with gr.Row(equal_height=True):
574
  with gr.Column(scale=1):
575
  prompt_audio = gr.Audio(
576
- label=i18n("prompt_audio_label"),
577
  type="filepath",
578
  editable=False,
579
  interactive=True,
580
  )
581
- with gr.Column(scale=1):
582
  target_audio = gr.Audio(
583
- label=i18n("target_audio_label"),
584
  type="filepath",
585
  editable=False,
586
  interactive=True,
587
  )
588
- with gr.Row(equal_height=True):
589
- prompt_lyric_lang = gr.Dropdown(
590
- label=i18n("prompt_lyric_lang_label"),
591
- choices=get_lyric_lang_choices(),
592
- value="Mandarin",
593
- interactive=True,
594
- scale=1,
595
- )
596
- target_lyric_lang = gr.Dropdown(
597
- label=i18n("target_lyric_lang_label"),
598
- choices=get_lyric_lang_choices(),
599
- value="Mandarin",
600
- interactive=True,
601
- scale=1,
602
- )
603
- prompt_vocal_sep = gr.Checkbox(
604
- label=i18n("prompt_vocal_sep_label"),
605
- value=False,
606
- interactive=True,
607
- scale=1,
608
- )
609
- target_vocal_sep = gr.Checkbox(
610
- label=i18n("target_vocal_sep_label"),
611
- value=True,
612
- interactive=True,
613
- scale=1,
614
- )
615
- with gr.Row():
616
- transcription_btn = gr.Button(
617
- value=i18n("transcription_btn_label"),
618
- variant="primary",
619
- size="lg",
620
- )
621
 
622
- # Edit tutorial link (gr.HTML supports links; component labels do not)
623
- metadata_tutorial_html = gr.HTML(value=i18n("edit_tutorial_html"))
624
- # Synthesis: params row, then synthesis button on next row
625
- with gr.Row(equal_height=True):
626
- prompt_metadata = gr.File(
627
- label=i18n("prompt_meta_label"),
628
- type="filepath",
629
- file_types=[".json"],
630
- interactive=True,
631
- )
632
- target_metadata = gr.File(
633
- label=i18n("target_meta_label"),
634
- type="filepath",
635
- file_types=[".json"],
636
- interactive=True,
637
- )
638
- control_radio = gr.Radio(
639
- choices=["melody", "score"],
640
- value="score",
641
- label=i18n("control_type_label"),
642
- scale=1,
643
- )
644
- auto_shift = gr.Checkbox(
645
- label=i18n("auto_shift_label"),
646
- value=True,
647
- interactive=True,
648
- scale=1,
649
- )
650
- pitch_shift = gr.Number(
651
- label=i18n("pitch_shift_label"),
652
- value=0,
653
- minimum=-36,
654
- maximum=36,
655
- step=1,
656
- interactive=True,
657
- scale=1,
658
- )
659
- seed_input = gr.Number(
660
- label=i18n("seed_label"),
661
- value=12306,
662
- step=1,
663
- interactive=True,
664
- scale=1,
665
- )
666
- with gr.Row():
667
- synthesis_btn = gr.Button(
668
- value=i18n("synthesis_btn_label"),
669
- variant="primary",
670
- size="lg",
671
- )
672
- with gr.Row():
673
- output_audio = gr.Audio(
674
- label=i18n("generated_audio_label"),
675
- type="filepath",
676
- interactive=False,
677
- )
678
-
679
- example_choice.change(
680
- fn=_load_example,
681
- inputs=[example_choice],
682
- outputs=[
683
- prompt_audio,
684
- target_audio,
685
- prompt_metadata,
686
- target_metadata,
687
- prompt_lyric_lang,
688
- target_lyric_lang,
689
- control_radio,
690
- prompt_vocal_sep,
691
- target_vocal_sep,
692
- auto_shift,
693
- pitch_shift,
694
- skip_clear_metadata_count,
695
- ],
696
- )
697
-
698
- def _change_component_language(lang):
699
- global global_lang
700
- global_lang = ["zh", "en"][lang]
701
- choices = get_lyric_lang_choices()
702
- return [
703
- gr.update(label=i18n("prompt_audio_label")),
704
- gr.update(label=i18n("target_audio_label")),
705
- gr.update(label=i18n("prompt_lyric_lang_label"), choices=choices),
706
- gr.update(label=i18n("target_lyric_lang_label"), choices=choices),
707
- gr.update(label=i18n("prompt_vocal_sep_label")),
708
- gr.update(label=i18n("target_vocal_sep_label")),
709
- gr.update(value=i18n("transcription_btn_label")),
710
- gr.update(label=i18n("prompt_meta_label")),
711
- gr.update(label=i18n("target_meta_label")),
712
- gr.update(value=i18n("edit_tutorial_html")),
713
- gr.update(label=i18n("control_type_label")),
714
- gr.update(label=i18n("auto_shift_label")),
715
- gr.update(label=i18n("pitch_shift_label")),
716
- gr.update(label=i18n("seed_label")),
717
- gr.update(value=i18n("synthesis_btn_label")),
718
- gr.update(label=i18n("generated_audio_label")),
719
- gr.update(label=i18n("display_lang_label")),
720
- gr.update(
721
- label=i18n("examples_label"),
722
- choices=[i18n("example_choice_0"), i18n("example_choice_1"), i18n("example_choice_2"), i18n("example_choice_3"), i18n("example_choice_4")],
723
- value=i18n("example_choice_0"),
724
- ),
725
- gr.update(value=f"### {i18n('instruction_title')}\n\n{_instruction_md()}"),
726
- ]
727
-
728
- lang_choice.change(
729
- fn=_change_component_language,
730
- inputs=[lang_choice],
731
- outputs=[
732
- prompt_audio,
733
- target_audio,
734
- prompt_lyric_lang,
735
- target_lyric_lang,
736
- prompt_vocal_sep,
737
- target_vocal_sep,
738
- transcription_btn,
739
- prompt_metadata,
740
- target_metadata,
741
- metadata_tutorial_html,
742
- control_radio,
743
- auto_shift,
744
- pitch_shift,
745
- seed_input,
746
- synthesis_btn,
747
- output_audio,
748
- lang_choice,
749
- example_choice,
750
- instruction_md,
751
- ],
752
- )
753
 
754
- # Upload new prompt/target audio clear corresponding metadata; skip clear when change came from load example
755
- prompt_audio.change(
756
- fn=_clear_prompt_meta_unless_example,
757
- inputs=[prompt_audio, skip_clear_metadata_count],
758
- outputs=[prompt_metadata, skip_clear_metadata_count],
759
- )
760
- target_audio.change(
761
- fn=_clear_target_meta_unless_example,
762
- inputs=[target_audio, skip_clear_metadata_count],
763
- outputs=[target_metadata, skip_clear_metadata_count],
764
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
765
 
 
766
  transcription_btn.click(
767
  fn=transcription_function,
768
  inputs=[
769
- prompt_audio,
770
- target_audio,
771
- prompt_metadata,
772
- target_metadata,
773
- prompt_lyric_lang,
774
- target_lyric_lang,
775
- prompt_vocal_sep,
776
- target_vocal_sep,
777
  ],
778
  outputs=[prompt_metadata, target_metadata],
779
  )
@@ -781,15 +518,13 @@ def render_interface() -> gr.Blocks:
781
  synthesis_btn.click(
782
  fn=synthesis_function,
783
  inputs=[
784
- prompt_audio,
785
- prompt_metadata,
786
- target_metadata,
787
- control_radio,
788
- auto_shift,
789
- pitch_shift,
790
- seed_input,
791
  ],
792
- outputs=[output_audio],
793
  )
794
 
795
  return page
@@ -804,4 +539,4 @@ if __name__ == "__main__":
804
 
805
  page = render_interface()
806
  page.queue()
807
- page.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)
 
5
  import sys
6
  import traceback
7
  from pathlib import Path
8
+ from typing import Tuple
9
  import spaces
10
 
11
  import numpy as np
 
21
 
22
  ROOT = Path(__file__).parent
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def _get_device() -> str:
 
 
 
 
 
 
26
  if torch.cuda.is_available():
27
  return "cuda:0"
28
  try:
 
46
  self.device = _get_device()
47
  self.preprocess_pipeline = PreprocessPipeline(
48
  device=self.device,
49
+ language="English",
50
  save_dir=str(ROOT / "outputs" / "gradio" / "_placeholder" / "transcriptions"),
51
  vocal_sep=True,
52
  max_merge_duration=60000,
 
76
  audio_path=str(prompt_path),
77
  vocal_sep=prompt_vocal_sep,
78
  max_merge_duration=20000,
79
+ language=prompt_lyric_lang or "English",
80
  )
81
  self.preprocess_pipeline.save_dir = str(session_base / "transcriptions" / "target")
82
  self.preprocess_pipeline.run(
83
  audio_path=str(target_path),
84
  vocal_sep=target_vocal_sep,
85
  max_merge_duration=60000,
86
+ language=target_lyric_lang or "English",
87
  )
88
  return True, "preprocess done"
89
  except Exception as e:
 
100
  control = "score"
101
  save_dir = session_base / "generated"
102
  save_dir.mkdir(parents=True, exist_ok=True)
103
+
104
  class Args:
105
  pass
106
+
107
  args = Args()
108
  args.device = self.device
109
  args.model_path = "pretrained_models/SoulX-Singer/model.pt"
 
137
  pitch_shift: int,
138
  save_dir: Path | None = None,
139
  ) -> Tuple[bool, str, Path | None]:
 
140
  if save_dir is None:
141
  import uuid
142
  save_dir = ROOT / "outputs" / "gradio" / "synthesis" / str(uuid.uuid4())[:8]
 
167
  APP_STATE = AppState()
168
 
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  def _resolve_file_path(x):
 
171
  if x is None:
172
  return None
173
  if isinstance(x, tuple):
174
  x = x[0]
175
  return x if (x and os.path.isfile(x)) else None
176
 
177
+
178
+ def _run_transcription_internal(
179
+ prompt_audio, target_audio,
180
+ prompt_lyric_lang, target_lyric_lang,
181
+ prompt_vocal_sep, target_vocal_sep,
182
+ ):
183
+ """Run transcription, return (prompt_meta_path, target_meta_path) or (None, None)."""
184
+ if isinstance(prompt_audio, tuple):
185
+ prompt_audio = prompt_audio[0]
186
+ if isinstance(target_audio, tuple):
187
+ target_audio = target_audio[0]
188
+
189
+ session_base = _session_dir_from_target(target_audio)
190
+ audio_dir = session_base / "audio"
191
+ audio_dir.mkdir(parents=True, exist_ok=True)
192
+
193
+ SR = 44100
194
+ PROMPT_MAX_SEC = 30
195
+ TARGET_MAX_SEC = 60
196
+ prompt_audio_data, _ = librosa.load(prompt_audio, sr=SR, mono=True)
197
+ target_audio_data, _ = librosa.load(target_audio, sr=SR, mono=True)
198
+ prompt_audio_data = prompt_audio_data[: PROMPT_MAX_SEC * SR]
199
+ target_audio_data = target_audio_data[: TARGET_MAX_SEC * SR]
200
+ sf.write(audio_dir / "prompt.wav", prompt_audio_data, SR)
201
+ sf.write(audio_dir / "target.wav", target_audio_data, SR)
202
+
203
+ ok, msg = APP_STATE.run_preprocess(
204
+ audio_dir / "prompt.wav",
205
+ audio_dir / "target.wav",
206
+ session_base,
207
+ prompt_vocal_sep=prompt_vocal_sep,
208
+ target_vocal_sep=target_vocal_sep,
209
+ prompt_lyric_lang=prompt_lyric_lang or "English",
210
+ target_lyric_lang=target_lyric_lang or "English",
211
+ )
212
+ if not ok:
213
+ print(msg, file=sys.stderr, flush=True)
214
+ return None, None
215
+
216
+ prompt_meta_path = session_base / "transcriptions" / "prompt" / "metadata.json"
217
+ target_meta_path = session_base / "transcriptions" / "target" / "metadata.json"
218
+ p = str(prompt_meta_path) if prompt_meta_path.exists() else None
219
+ t = str(target_meta_path) if target_meta_path.exists() else None
220
+ return p, t
221
+
222
+
223
  @spaces.GPU
224
  def transcription_function(
225
+ prompt_audio, target_audio,
226
+ prompt_metadata, target_metadata,
227
+ prompt_lyric_lang, target_lyric_lang,
228
+ prompt_vocal_sep, target_vocal_sep,
 
 
 
 
229
  ):
230
  """Step 1: Run transcription only; output (prompt_meta_path, target_meta_path)."""
231
  try:
 
234
  if isinstance(target_audio, tuple):
235
  target_audio = target_audio[0]
236
  if prompt_audio is None or target_audio is None:
237
+ gr.Warning(message="Please upload both prompt audio and target audio")
238
  return None, None
239
+
240
  prompt_meta_resolved = _resolve_file_path(prompt_metadata)
241
  target_meta_resolved = _resolve_file_path(target_metadata)
242
  use_input_metadata = prompt_meta_resolved is not None and target_meta_resolved is not None
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  if use_input_metadata:
245
+ session_base = _session_dir_from_target(target_audio)
246
+ audio_dir = session_base / "audio"
247
+ audio_dir.mkdir(parents=True, exist_ok=True)
248
+ SR = 44100
249
+ prompt_audio_data, _ = librosa.load(prompt_audio, sr=SR, mono=True)
250
+ target_audio_data, _ = librosa.load(target_audio, sr=SR, mono=True)
251
+ prompt_audio_data = prompt_audio_data[: 30 * SR]
252
+ target_audio_data = target_audio_data[: 60 * SR]
253
+ sf.write(audio_dir / "prompt.wav", prompt_audio_data, SR)
254
+ sf.write(audio_dir / "target.wav", target_audio_data, SR)
255
+
256
+ prompt_meta_path = session_base / "transcriptions" / "prompt" / "metadata.json"
257
+ target_meta_path = session_base / "transcriptions" / "target" / "metadata.json"
258
  (session_base / "transcriptions" / "prompt").mkdir(parents=True, exist_ok=True)
259
  (session_base / "transcriptions" / "target").mkdir(parents=True, exist_ok=True)
260
  shutil.copy2(prompt_meta_resolved, prompt_meta_path)
261
  shutil.copy2(target_meta_resolved, target_meta_path)
262
+ return str(prompt_meta_path), str(target_meta_path)
263
  else:
264
+ return _run_transcription_internal(
265
+ prompt_audio, target_audio,
266
+ prompt_lyric_lang, target_lyric_lang,
267
+ prompt_vocal_sep, target_vocal_sep,
 
 
 
 
268
  )
 
 
 
 
 
 
 
269
  except Exception:
270
  print(traceback.format_exc(), file=sys.stderr, flush=True)
271
  return None, None
272
 
273
@spaces.GPU
def synthesis_function(
    prompt_audio,
    target_audio,
    prompt_metadata,
    target_metadata,
    control,
    auto_shift,
    pitch_shift,
    seed,
    prompt_lyric_lang,
    target_lyric_lang,
    prompt_vocal_sep,
    target_vocal_sep,
):
    """Single-button synthesis: transcribe first if metadata is missing, then run SVS.

    Parameters mirror the Gradio inputs wired in ``render_interface``:
    audio filepaths (or Gradio ``(path, ...)`` tuples), optional metadata
    JSON files, the control mode (``"melody"``/``"score"``), pitch options,
    an RNG seed, and the transcription settings forwarded to
    ``_run_transcription_internal``.

    Returns:
        ``(merged_audio_path, prompt_meta_path, target_meta_path)`` on
        success, or ``(None, gr.update(), gr.update())`` on failure so the
        metadata File components keep their current values.
    """
    try:
        # Gradio audio components may hand back (path, ...) tuples; keep the path.
        if isinstance(prompt_audio, tuple):
            prompt_audio = prompt_audio[0]
        if isinstance(target_audio, tuple):
            target_audio = target_audio[0]

        # Both audio files must exist before anything else runs. (Single guard
        # replaces two duplicated branches that emitted the same warning.)
        if (
            not prompt_audio
            or not os.path.isfile(prompt_audio)
            or not target_audio
            or not os.path.isfile(target_audio)
        ):
            gr.Warning(message="Please upload both prompt audio and target audio")
            return None, gr.update(), gr.update()

        prompt_meta_path = _resolve_file_path(prompt_metadata)
        target_meta_path = _resolve_file_path(target_metadata)

        # Auto-run transcription when either metadata file is missing.
        if not prompt_meta_path or not target_meta_path:
            prompt_meta_path, target_meta_path = _run_transcription_internal(
                prompt_audio, target_audio,
                prompt_lyric_lang, target_lyric_lang,
                prompt_vocal_sep, target_vocal_sep,
            )
            if not prompt_meta_path or not target_meta_path:
                gr.Warning(message="Transcription failed. Check your audio files.")
                return None, gr.update(), gr.update()

        # Ensure the 30s mono prompt wav exists in the per-target session dir.
        # NOTE(review): the session dir is keyed on the *target* audio only, so a
        # prompt.wav left over from an earlier run with a different prompt audio
        # would be reused here — consider rewriting it unconditionally; confirm
        # the intended behavior with the session-dir helper.
        session_base = _session_dir_from_target(target_audio)
        prompt_wav = session_base / "audio" / "prompt.wav"
        if not prompt_wav.exists():
            prompt_wav.parent.mkdir(parents=True, exist_ok=True)
            SR = 44100
            data, _ = librosa.load(prompt_audio, sr=SR, mono=True)
            # Truncate the reference voice to 30 seconds, matching the UI label.
            sf.write(prompt_wav, data[: 30 * SR], SR)

        # Fall back to score control for unexpected values; seed all RNG sources
        # so generation is reproducible for a given seed.
        if control not in ("melody", "score"):
            control = "score"
        seed = int(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        ok, msg, merged = APP_STATE.run_svs_from_paths(
            prompt_wav_path=str(prompt_wav),
            prompt_metadata_path=prompt_meta_path,
            target_metadata_path=target_meta_path,
            control=control,
            # NOTE(review): two argument lines were elided in the reviewed diff;
            # reconstructed as the pitch options — verify against the
            # run_svs_from_paths signature.
            auto_shift=auto_shift,
            pitch_shift=pitch_shift,
        )
        if not ok or merged is None:
            print(msg or "synthesis failed", file=sys.stderr, flush=True)
            return None, gr.update(), gr.update()

        # Success: generated audio plus the (possibly freshly created) metadata
        # paths, so the File components reflect what was actually used.
        return str(merged), prompt_meta_path, target_meta_path
    except Exception:
        # Top-level UI boundary: log the traceback and return a "no result" update.
        print(traceback.format_exc(), file=sys.stderr, flush=True)
        return None, gr.update(), gr.update()
 
 
354
 
355
 
356
  def render_interface() -> gr.Blocks:
357
+ with gr.Blocks(title="SoulX-Singer", theme=gr.themes.Default()) as page:
358
  gr.HTML(
359
  '<div style="'
360
  'text-align: center; '
 
378
  '"></div>'
379
  '</div>'
380
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
+ with gr.Row(equal_height=False):
383
+ # ── Left column: inputs & controls ──
384
  with gr.Column(scale=1):
385
  prompt_audio = gr.Audio(
386
+ label="Prompt audio (reference voice), max 30s",
387
  type="filepath",
388
  editable=False,
389
  interactive=True,
390
  )
 
391
  target_audio = gr.Audio(
392
+ label="Target audio (melody / lyrics source), max 60s",
393
  type="filepath",
394
  editable=False,
395
  interactive=True,
396
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
+ with gr.Row():
399
+ control_radio = gr.Radio(
400
+ choices=["melody", "score"],
401
+ value="score",
402
+ label="Control type",
403
+ scale=1,
404
+ )
405
+ auto_shift = gr.Checkbox(
406
+ label="Auto pitch shift",
407
+ value=True,
408
+ interactive=True,
409
+ scale=1,
410
+ )
411
+ with gr.Row():
412
+ pitch_shift = gr.Number(
413
+ label="Pitch shift (semitones)",
414
+ value=0,
415
+ minimum=-36,
416
+ maximum=36,
417
+ step=1,
418
+ interactive=True,
419
+ scale=1,
420
+ )
421
+ seed_input = gr.Number(
422
+ label="Seed",
423
+ value=12306,
424
+ step=1,
425
+ interactive=True,
426
+ scale=1,
427
+ )
428
+
429
+ synthesis_btn = gr.Button(
430
+ value="🎤 Generate singing voice",
431
+ variant="primary",
432
+ size="lg",
433
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
+ # ── Advanced: transcription settings & metadata ──
436
+ with gr.Accordion("Advanced: Transcription & Metadata", open=False):
437
+ gr.Markdown(
438
+ "Upload your own metadata files to skip automatic transcription. "
439
+ "You can use the [SoulX-Singer-Midi-Editor]"
440
+ "(https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor) "
441
+ "to edit metadata for better alignment."
442
+ )
443
+ with gr.Row():
444
+ prompt_lyric_lang = gr.Dropdown(
445
+ label="Prompt lyric language",
446
+ choices=[
447
+ ("Mandarin", "Mandarin"),
448
+ ("Cantonese", "Cantonese"),
449
+ ("English", "English"),
450
+ ],
451
+ value="English",
452
+ interactive=True,
453
+ scale=1,
454
+ )
455
+ target_lyric_lang = gr.Dropdown(
456
+ label="Target lyric language",
457
+ choices=[
458
+ ("Mandarin", "Mandarin"),
459
+ ("Cantonese", "Cantonese"),
460
+ ("English", "English"),
461
+ ],
462
+ value="English",
463
+ interactive=True,
464
+ scale=1,
465
+ )
466
+ with gr.Row():
467
+ prompt_vocal_sep = gr.Checkbox(
468
+ label="Prompt vocal separation",
469
+ value=False,
470
+ interactive=True,
471
+ scale=1,
472
+ )
473
+ target_vocal_sep = gr.Checkbox(
474
+ label="Target vocal separation",
475
+ value=True,
476
+ interactive=True,
477
+ scale=1,
478
+ )
479
+ transcription_btn = gr.Button(
480
+ value="Run singing transcription",
481
+ variant="secondary",
482
+ size="lg",
483
+ )
484
+ with gr.Row():
485
+ prompt_metadata = gr.File(
486
+ label="Prompt metadata",
487
+ type="filepath",
488
+ file_types=[".json"],
489
+ interactive=True,
490
+ )
491
+ target_metadata = gr.File(
492
+ label="Target metadata",
493
+ type="filepath",
494
+ file_types=[".json"],
495
+ interactive=True,
496
+ )
497
+
498
+ # ── Right column: output ──
499
+ with gr.Column(scale=1):
500
+ output_audio = gr.Audio(
501
+ label="Generated audio",
502
+ type="filepath",
503
+ interactive=False,
504
+ )
505
 
506
+ # ── Event handlers ──
507
  transcription_btn.click(
508
  fn=transcription_function,
509
  inputs=[
510
+ prompt_audio, target_audio,
511
+ prompt_metadata, target_metadata,
512
+ prompt_lyric_lang, target_lyric_lang,
513
+ prompt_vocal_sep, target_vocal_sep,
 
 
 
 
514
  ],
515
  outputs=[prompt_metadata, target_metadata],
516
  )
 
518
  synthesis_btn.click(
519
  fn=synthesis_function,
520
  inputs=[
521
+ prompt_audio, target_audio,
522
+ prompt_metadata, target_metadata,
523
+ control_radio, auto_shift, pitch_shift, seed_input,
524
+ prompt_lyric_lang, target_lyric_lang,
525
+ prompt_vocal_sep, target_vocal_sep,
 
 
526
  ],
527
+ outputs=[output_audio, prompt_metadata, target_metadata],
528
  )
529
 
530
  return page
 
# Build the UI, enable request queueing, and serve on all interfaces.
# NOTE(review): `args` (share/port) comes from CLI argument parsing that was
# elided from this view — confirm these statements' enclosing scope (module
# level vs. a __main__ guard) against the repository.
page = render_interface()
page.queue()
page.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)