Spaces:
Running
Running
wire fast captioning (CLAP+Whisper+VAD) into training, add LM caption checkbox
Browse files
app.py
CHANGED
|
@@ -538,7 +538,7 @@ def gradio_main():
|
|
| 538 |
return "\n".join(lines)
|
| 539 |
|
| 540 |
# -- Training generator (direct integration, no subprocess) --
|
| 541 |
-
def train_lora_ui(audio_files, lora_name, epochs, lr, rank):
|
| 542 |
"""Generator that yields (train_log, train_btn_update, cancel_btn_update, output_file_update)."""
|
| 543 |
import gc as _gc
|
| 544 |
|
|
@@ -642,14 +642,9 @@ def gradio_main():
|
|
| 642 |
f"Epochs: {epochs} | LR: {lr} | Rank: {rank}")
|
| 643 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 644 |
|
| 645 |
-
# Caption audio files
|
| 646 |
-
|
| 647 |
-
method = "GGUF LM (BPM, key, mood, lyrics)" if use_understand else "librosa (BPM only)"
|
| 648 |
-
_log(f"[INFO] Auto-captioning via {method}...")
|
| 649 |
-
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 650 |
for audio_fname in sorted(os.listdir(audio_dir)):
|
| 651 |
-
if _training_cancel.is_set():
|
| 652 |
-
break
|
| 653 |
full_path = os.path.join(audio_dir, audio_fname)
|
| 654 |
if not os.path.isfile(full_path):
|
| 655 |
continue
|
|
@@ -661,28 +656,78 @@ def gradio_main():
|
|
| 661 |
sidecar_txt = os.path.join(audio_dir, stem + ".txt")
|
| 662 |
if os.path.isfile(sidecar_json) or os.path.isfile(sidecar_txt):
|
| 663 |
_log(f" {audio_fname}: using caption file")
|
| 664 |
-
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 665 |
continue
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 670 |
caption_data = _caption_via_understand(
|
| 671 |
-
full_path, timeout=
|
| 672 |
cancel_check=lambda: _training_cancel.is_set(),
|
| 673 |
)
|
| 674 |
-
if
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
|
|
|
|
|
|
|
|
|
| 685 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
|
| 687 |
if _training_cancel.is_set():
|
| 688 |
_training_cancel.clear()
|
|
@@ -952,6 +997,10 @@ def gradio_main():
|
|
| 952 |
label="Rank (r)", minimum=1, maximum=128,
|
| 953 |
value=16, step=1,
|
| 954 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
|
| 956 |
# Button swap on click (separate handler, like rvc-beatrice)
|
| 957 |
# This fires immediately so user sees Cancel even if training
|
|
@@ -964,7 +1013,7 @@ def gradio_main():
|
|
| 964 |
# Training generator -- yields (log, train_btn, cancel_btn, output_file)
|
| 965 |
train_event = train_btn.click(
|
| 966 |
train_lora_ui,
|
| 967 |
-
inputs=[train_audio, lora_name, train_epochs, train_lr, train_rank],
|
| 968 |
outputs=[train_log, train_btn, cancel_btn, train_output_file],
|
| 969 |
api_name="train_lora",
|
| 970 |
concurrency_limit=1,
|
|
|
|
| 538 |
return "\n".join(lines)
|
| 539 |
|
| 540 |
# -- Training generator (direct integration, no subprocess) --
|
| 541 |
+
def train_lora_ui(audio_files, lora_name, epochs, lr, rank, use_lm_caption):
|
| 542 |
"""Generator that yields (train_log, train_btn_update, cancel_btn_update, output_file_update)."""
|
| 543 |
import gc as _gc
|
| 544 |
|
|
|
|
| 642 |
f"Epochs: {epochs} | LR: {lr} | Rank: {rank}")
|
| 643 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 644 |
|
| 645 |
+
# Caption audio files without user-provided sidecars
|
| 646 |
+
audio_to_caption = []
|
|
|
|
|
|
|
|
|
|
| 647 |
for audio_fname in sorted(os.listdir(audio_dir)):
|
|
|
|
|
|
|
| 648 |
full_path = os.path.join(audio_dir, audio_fname)
|
| 649 |
if not os.path.isfile(full_path):
|
| 650 |
continue
|
|
|
|
| 656 |
sidecar_txt = os.path.join(audio_dir, stem + ".txt")
|
| 657 |
if os.path.isfile(sidecar_json) or os.path.isfile(sidecar_txt):
|
| 658 |
_log(f" {audio_fname}: using caption file")
|
|
|
|
| 659 |
continue
|
| 660 |
+
audio_to_caption.append((audio_fname, full_path, sidecar_json))
|
| 661 |
+
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 662 |
+
|
| 663 |
+
if audio_to_caption and use_lm_caption and _server_ok():
|
| 664 |
+
# --- Mode: GGUF LM captioning (slow, best quality) ---
|
| 665 |
+
est_total = int(total_dur * 7 + len(audio_to_caption) * 600)
|
| 666 |
+
_log(f"[INFO] LM captioning {len(audio_to_caption)} files "
|
| 667 |
+
f"(estimated ~{est_total // 60} min)...")
|
| 668 |
+
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 669 |
+
for audio_fname, full_path, sidecar_json in audio_to_caption:
|
| 670 |
+
if _training_cancel.is_set():
|
| 671 |
+
break
|
| 672 |
+
file_dur = _lr.get_duration(path=full_path)
|
| 673 |
+
file_timeout = int(file_dur * 7 + 600)
|
| 674 |
+
_log(f" {audio_fname}: LM captioning (timeout {file_timeout // 60} min)...")
|
| 675 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 676 |
caption_data = _caption_via_understand(
|
| 677 |
+
full_path, timeout=file_timeout,
|
| 678 |
cancel_check=lambda: _training_cancel.is_set(),
|
| 679 |
)
|
| 680 |
+
if caption_data:
|
| 681 |
+
bpm_s = caption_data.get("bpm", "?")
|
| 682 |
+
key_s = caption_data.get("keyscale", caption_data.get("key", "?"))
|
| 683 |
+
_log(f" {audio_fname}: OK (BPM={bpm_s}, key={key_s})")
|
| 684 |
+
with open(sidecar_json, "w") as cj:
|
| 685 |
+
json.dump(caption_data, cj)
|
| 686 |
+
else:
|
| 687 |
+
_log(f" {audio_fname}: LM failed, will use fast captioning")
|
| 688 |
+
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 689 |
+
|
| 690 |
+
elif audio_to_caption:
|
| 691 |
+
# --- Mode: Fast captioning (CLAP + Whisper + librosa) ---
|
| 692 |
+
_log(f"[INFO] Fast captioning {len(audio_to_caption)} files "
|
| 693 |
+
f"(CLAP tags + lyrics + BPM)...")
|
| 694 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 695 |
+
try:
|
| 696 |
+
from caption_fast import caption_audio, unload_caption_models
|
| 697 |
+
for audio_fname, full_path, sidecar_json in audio_to_caption:
|
| 698 |
+
if _training_cancel.is_set():
|
| 699 |
+
break
|
| 700 |
+
_log(f" {audio_fname}: analyzing...")
|
| 701 |
+
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 702 |
+
try:
|
| 703 |
+
result = caption_audio(full_path)
|
| 704 |
+
_log(f" {audio_fname}: {result.get('caption', '')[:60]}")
|
| 705 |
+
if result.get("lyrics") and result["lyrics"] != "[Instrumental]":
|
| 706 |
+
_log(f" {audio_fname}: lyrics extracted ({len(result['lyrics'])} chars)")
|
| 707 |
+
with open(sidecar_json, "w") as cj:
|
| 708 |
+
json.dump(result, cj)
|
| 709 |
+
except Exception as cap_exc:
|
| 710 |
+
_log(f" {audio_fname}: fast caption failed: {cap_exc}")
|
| 711 |
+
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 712 |
+
unload_caption_models()
|
| 713 |
+
_gc.collect()
|
| 714 |
+
except ImportError:
|
| 715 |
+
_log("[WARN] Fast captioning not available, using librosa fallback")
|
| 716 |
+
for audio_fname, full_path, sidecar_json in audio_to_caption:
|
| 717 |
+
try:
|
| 718 |
+
y_cap, sr_cap = _lr.load(full_path, sr=None, mono=True)
|
| 719 |
+
tempo_arr, _ = _lr.beat.beat_track(y=y_cap, sr=sr_cap)
|
| 720 |
+
bpm_val = int(round(float(
|
| 721 |
+
tempo_arr.item() if hasattr(tempo_arr, 'item') else tempo_arr)))
|
| 722 |
+
fallback = {"caption": audio_fname.rsplit(".", 1)[0],
|
| 723 |
+
"bpm": str(bpm_val), "key": "", "signature": "4/4",
|
| 724 |
+
"lyrics": "[Instrumental]"}
|
| 725 |
+
with open(sidecar_json, "w") as cj:
|
| 726 |
+
json.dump(fallback, cj)
|
| 727 |
+
_log(f" {audio_fname}: librosa BPM={bpm_val}")
|
| 728 |
+
except Exception as exc:
|
| 729 |
+
_log(f" {audio_fname}: failed: {exc}")
|
| 730 |
+
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 731 |
|
| 732 |
if _training_cancel.is_set():
|
| 733 |
_training_cancel.clear()
|
|
|
|
| 997 |
label="Rank (r)", minimum=1, maximum=128,
|
| 998 |
value=16, step=1,
|
| 999 |
)
|
| 1000 |
+
use_lm_caption = gr.Checkbox(
|
| 1001 |
+
label="Use LM captioning (best quality, ~30 min/file)",
|
| 1002 |
+
value=False,
|
| 1003 |
+
)
|
| 1004 |
|
| 1005 |
# Button swap on click (separate handler, like rvc-beatrice)
|
| 1006 |
# This fires immediately so user sees Cancel even if training
|
|
|
|
| 1013 |
# Training generator -- yields (log, train_btn, cancel_btn, output_file)
|
| 1014 |
train_event = train_btn.click(
|
| 1015 |
train_lora_ui,
|
| 1016 |
+
inputs=[train_audio, lora_name, train_epochs, train_lr, train_rank, use_lm_caption],
|
| 1017 |
outputs=[train_log, train_btn, cancel_btn, train_output_file],
|
| 1018 |
api_name="train_lora",
|
| 1019 |
concurrency_limit=1,
|