Nekochu committed on
Commit
d6a3e45
·
1 Parent(s): 4619f39

wire fast captioning (CLAP+Whisper+VAD) into training, add LM caption checkbox

Browse files
Files changed (1) hide show
  1. app.py +74 -25
app.py CHANGED
@@ -538,7 +538,7 @@ def gradio_main():
538
  return "\n".join(lines)
539
 
540
  # -- Training generator (direct integration, no subprocess) --
541
- def train_lora_ui(audio_files, lora_name, epochs, lr, rank):
542
  """Generator that yields (train_log, train_btn_update, cancel_btn_update)."""
543
  import gc as _gc
544
 
@@ -642,14 +642,9 @@ def gradio_main():
642
  f"Epochs: {epochs} | LR: {lr} | Rank: {rank}")
643
  yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
644
 
645
- # Caption audio files: GGUF LM if ace-server running, else librosa
646
- use_understand = _server_ok()
647
- method = "GGUF LM (BPM, key, mood, lyrics)" if use_understand else "librosa (BPM only)"
648
- _log(f"[INFO] Auto-captioning via {method}...")
649
- yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
650
  for audio_fname in sorted(os.listdir(audio_dir)):
651
- if _training_cancel.is_set():
652
- break
653
  full_path = os.path.join(audio_dir, audio_fname)
654
  if not os.path.isfile(full_path):
655
  continue
@@ -661,28 +656,78 @@ def gradio_main():
661
  sidecar_txt = os.path.join(audio_dir, stem + ".txt")
662
  if os.path.isfile(sidecar_json) or os.path.isfile(sidecar_txt):
663
  _log(f" {audio_fname}: using caption file")
664
- yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
665
  continue
666
- caption_data = None
667
- if use_understand:
668
- _log(f" {audio_fname}: GGUF LM captioning...")
 
 
 
 
 
 
 
 
 
 
 
 
669
  yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
670
  caption_data = _caption_via_understand(
671
- full_path, timeout=600,
672
  cancel_check=lambda: _training_cancel.is_set(),
673
  )
674
- if not caption_data:
675
- use_understand = False
676
- _log(f" {audio_fname}: GGUF LM too slow, skipping (preprocessing will analyze)")
677
- if caption_data:
678
- bpm_s = caption_data.get("bpm", "?")
679
- key_s = caption_data.get("keyscale", caption_data.get("key", "?"))
680
- _log(f" {audio_fname}: OK (BPM={bpm_s}, key={key_s})")
681
- with open(sidecar_json, "w") as cj:
682
- json.dump(caption_data, cj)
683
- else:
684
- _log(f" {audio_fname}: will be analyzed in preprocessing (BPM + key + caption)")
 
 
 
685
  yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
 
687
  if _training_cancel.is_set():
688
  _training_cancel.clear()
@@ -952,6 +997,10 @@ def gradio_main():
952
  label="Rank (r)", minimum=1, maximum=128,
953
  value=16, step=1,
954
  )
 
 
 
 
955
 
956
  # Button swap on click (separate handler, like rvc-beatrice)
957
  # This fires immediately so user sees Cancel even if training
@@ -964,7 +1013,7 @@ def gradio_main():
964
  # Training generator -- yields (log, train_btn, cancel_btn, output_file)
965
  train_event = train_btn.click(
966
  train_lora_ui,
967
- inputs=[train_audio, lora_name, train_epochs, train_lr, train_rank],
968
  outputs=[train_log, train_btn, cancel_btn, train_output_file],
969
  api_name="train_lora",
970
  concurrency_limit=1,
 
538
  return "\n".join(lines)
539
 
540
  # -- Training generator (direct integration, no subprocess) --
541
+ def train_lora_ui(audio_files, lora_name, epochs, lr, rank, use_lm_caption):
542
  """Generator that yields (train_log, train_btn_update, cancel_btn_update)."""
543
  import gc as _gc
544
 
 
642
  f"Epochs: {epochs} | LR: {lr} | Rank: {rank}")
643
  yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
644
 
645
+ # Caption audio files without user-provided sidecars
646
+ audio_to_caption = []
 
 
 
647
  for audio_fname in sorted(os.listdir(audio_dir)):
 
 
648
  full_path = os.path.join(audio_dir, audio_fname)
649
  if not os.path.isfile(full_path):
650
  continue
 
656
  sidecar_txt = os.path.join(audio_dir, stem + ".txt")
657
  if os.path.isfile(sidecar_json) or os.path.isfile(sidecar_txt):
658
  _log(f" {audio_fname}: using caption file")
 
659
  continue
660
+ audio_to_caption.append((audio_fname, full_path, sidecar_json))
661
+ yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
662
+
663
+ if audio_to_caption and use_lm_caption and _server_ok():
664
+ # --- Mode: GGUF LM captioning (slow, best quality) ---
665
+ est_total = int(total_dur * 7 + len(audio_to_caption) * 600)
666
+ _log(f"[INFO] LM captioning {len(audio_to_caption)} files "
667
+ f"(estimated ~{est_total // 60} min)...")
668
+ yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
669
+ for audio_fname, full_path, sidecar_json in audio_to_caption:
670
+ if _training_cancel.is_set():
671
+ break
672
+ file_dur = _lr.get_duration(path=full_path)
673
+ file_timeout = int(file_dur * 7 + 600)
674
+ _log(f" {audio_fname}: LM captioning (timeout {file_timeout // 60} min)...")
675
  yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
676
  caption_data = _caption_via_understand(
677
+ full_path, timeout=file_timeout,
678
  cancel_check=lambda: _training_cancel.is_set(),
679
  )
680
+ if caption_data:
681
+ bpm_s = caption_data.get("bpm", "?")
682
+ key_s = caption_data.get("keyscale", caption_data.get("key", "?"))
683
+ _log(f" {audio_fname}: OK (BPM={bpm_s}, key={key_s})")
684
+ with open(sidecar_json, "w") as cj:
685
+ json.dump(caption_data, cj)
686
+ else:
687
+ _log(f" {audio_fname}: LM failed, will use fast captioning")
688
+ yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
689
+
690
+ elif audio_to_caption:
691
+ # --- Mode: Fast captioning (CLAP + Whisper + librosa) ---
692
+ _log(f"[INFO] Fast captioning {len(audio_to_caption)} files "
693
+ f"(CLAP tags + lyrics + BPM)...")
694
  yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
695
+ try:
696
+ from caption_fast import caption_audio, unload_caption_models
697
+ for audio_fname, full_path, sidecar_json in audio_to_caption:
698
+ if _training_cancel.is_set():
699
+ break
700
+ _log(f" {audio_fname}: analyzing...")
701
+ yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
702
+ try:
703
+ result = caption_audio(full_path)
704
+ _log(f" {audio_fname}: {result.get('caption', '')[:60]}")
705
+ if result.get("lyrics") and result["lyrics"] != "[Instrumental]":
706
+ _log(f" {audio_fname}: lyrics extracted ({len(result['lyrics'])} chars)")
707
+ with open(sidecar_json, "w") as cj:
708
+ json.dump(result, cj)
709
+ except Exception as cap_exc:
710
+ _log(f" {audio_fname}: fast caption failed: {cap_exc}")
711
+ yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
712
+ unload_caption_models()
713
+ _gc.collect()
714
+ except ImportError:
715
+ _log("[WARN] Fast captioning not available, using librosa fallback")
716
+ for audio_fname, full_path, sidecar_json in audio_to_caption:
717
+ try:
718
+ y_cap, sr_cap = _lr.load(full_path, sr=None, mono=True)
719
+ tempo_arr, _ = _lr.beat.beat_track(y=y_cap, sr=sr_cap)
720
+ bpm_val = int(round(float(
721
+ tempo_arr.item() if hasattr(tempo_arr, 'item') else tempo_arr)))
722
+ fallback = {"caption": audio_fname.rsplit(".", 1)[0],
723
+ "bpm": str(bpm_val), "key": "", "signature": "4/4",
724
+ "lyrics": "[Instrumental]"}
725
+ with open(sidecar_json, "w") as cj:
726
+ json.dump(fallback, cj)
727
+ _log(f" {audio_fname}: librosa BPM={bpm_val}")
728
+ except Exception as exc:
729
+ _log(f" {audio_fname}: failed: {exc}")
730
+ yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
731
 
732
  if _training_cancel.is_set():
733
  _training_cancel.clear()
 
997
  label="Rank (r)", minimum=1, maximum=128,
998
  value=16, step=1,
999
  )
1000
+ use_lm_caption = gr.Checkbox(
1001
+ label="Use LM captioning (best quality, ~30 min/file)",
1002
+ value=False,
1003
+ )
1004
 
1005
  # Button swap on click (separate handler, like rvc-beatrice)
1006
  # This fires immediately so user sees Cancel even if training
 
1013
  # Training generator -- yields (log, train_btn, cancel_btn, output_file)
1014
  train_event = train_btn.click(
1015
  train_lora_ui,
1016
+ inputs=[train_audio, lora_name, train_epochs, train_lr, train_rank, use_lm_caption],
1017
  outputs=[train_log, train_btn, cancel_btn, train_output_file],
1018
  api_name="train_lora",
1019
  concurrency_limit=1,