jacob1576 committed on
Commit
cffc5b3
·
1 Parent(s): 28e5197

Updated pip requirements and added code to load model from HF hub

Browse files
Files changed (3) hide show
  1. app.py +10 -4
  2. config.yaml +48 -0
  3. requirements.txt +5 -0
app.py CHANGED
@@ -17,7 +17,8 @@ import matplotlib.pyplot as plt
17
  from pathlib import Path
18
 
19
  from demucs import pretrained
20
- from transformers import ClapModel, AutoTokenizer
 
21
 
22
  from src.models.stem_separation.ATHTDemucs_v2 import AudioTextHTDemucs
23
  from utils import load_config, plot_spectrogram
@@ -27,7 +28,6 @@ from utils import load_config, plot_spectrogram
27
  # ============================================================================
28
 
29
  cfg = load_config("config.yaml")
30
- CHECKPOINT_PATH = cfg["training"]["resume_from"] # Change as needed
31
  SAMPLE_RATE = cfg["data"]["sample_rate"]
32
  SEGMENT_SECONDS = cfg["data"]["segment_seconds"]
33
  OVERLAP = cfg["data"]["overlap"]
@@ -41,6 +41,12 @@ else:
41
  DEVICE = "cpu"
42
  # DEVICE = "cpu"
43
 
 
 
 
 
 
 
44
 
45
  # ============================================================================
46
  # Model Loading
@@ -57,8 +63,8 @@ tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
57
  print("Building AudioTextHTDemucs...")
58
  model = AudioTextHTDemucs(htdemucs, clap, tokenizer)
59
 
60
- print(f"Loading checkpoint from {CHECKPOINT_PATH}...")
61
- checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")
62
  model.load_state_dict(checkpoint["model_state_dict"], strict=False)
63
  print(f"Loaded checkpoint from epoch {checkpoint.get('epoch', '?')}")
64
 
 
17
  from pathlib import Path
18
 
19
  from demucs import pretrained
20
+ from transformers import ClapModel, AutoTokenizer, AutoModel
21
+ from huggingface_hub import hf_hub_download
22
 
23
  from src.models.stem_separation.ATHTDemucs_v2 import AudioTextHTDemucs
24
  from utils import load_config, plot_spectrogram
 
28
  # ============================================================================
29
 
30
  cfg = load_config("config.yaml")
 
31
  SAMPLE_RATE = cfg["data"]["sample_rate"]
32
  SEGMENT_SECONDS = cfg["data"]["segment_seconds"]
33
  OVERLAP = cfg["data"]["overlap"]
 
41
  DEVICE = "cpu"
42
  # DEVICE = "cpu"
43
 
44
+ # Load model from HuggingFace Hub
45
+ # TODO: Add our model to the AutoModel interface
46
+ ckpt = hf_hub_download(
47
+ repo_id="jacob1576/AudioTextHTDemucs",
48
+ filename="best_model.pt"
49
+ )
50
 
51
  # ============================================================================
52
  # Model Loading
 
63
  print("Building AudioTextHTDemucs...")
64
  model = AudioTextHTDemucs(htdemucs, clap, tokenizer)
65
 
66
+ print(f"Loading checkpoint from HuggingFace Hub...")
67
+ checkpoint = torch.load(ckpt, map_location="cpu")
68
  model.load_state_dict(checkpoint["model_state_dict"], strict=False)
69
  print(f"Loaded checkpoint from epoch {checkpoint.get('epoch', '?')}")
70
 
config.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ train_dir: /home/jacob/datasets/musdb18/train # Path to train subfolder of MUSDB18 dataset
3
+ test_dir: /home/jacob/datasets/musdb18/test # Path to test subfolder of MUSDB18 dataset
4
+ segment_seconds: 6.0 # Length of audio segments for training [s]
5
+ pct_train: 0.2 # Decimal percentage of full data to use for training (otherwise 1 epoch takes ~15 hrs)
6
+ pct_test: 0.1 # Decimal percentage of full data to use for testing
7
+ overlap: 0.1 # Overlap between segments for chunked inference [s]
8
+ sample_rate: 44100 # Sample rate for audio files [Hz]
9
+ channels: 2 # Number of audio channels (1 = mono, 2 = stereo)
10
+ random_segments: False # Whether to use random segments during training
11
+ augment: True # Whether to use data augmentation (gain adjustment and channel swapping)
12
+
13
+ model:
14
+ name: Audio-Text-HTDemucs # Model name
15
+ model_dim: 384 # Model dimension
16
+ text_dim: 512 # Text embedding dimension (laion/clap-htsat-unfused is 512)
17
+ num_heads: 8 # Number of attention heads for text cross-attention layer
18
+ device: cpu # Device to use for training (cuda for GPU or cpu)
19
+ use_amp: False # Whether to use automatic mixed precision (AMP) during training - WORK IN PROGRESS
20
+
21
+ training:
22
+ batch_size: 8 # Batch size for training
23
+ num_workers: 0 # Number of DataLoader workers
24
+ num_epochs: 20 # Number of training epochs
25
+ optimizer:
26
+ name: AdamW
27
+ lr: 1e-4 # Learning rate
28
+ weight_decay: 1e-2 # Weight decay for optimizer
29
+ grad_clip: 5.0 # Gradient clipping value (set to null to disable)
30
+ loss_weights:
31
+ sdr: 0.9 # Weight for SDR loss
32
+ sisdr_weight: 0.1 # Weight for SI-SDR loss, total loss is (sdr_weight * sdr) + (sisdr_weight * si_sdr)
33
+ use_L1_comb_loss: False # Whether to use L1 combination loss
34
+ L1_comb_loss:
35
+ sdr_weight: 1.0 # Weight for SDR in L1 combination loss
36
+ l1_weight: 0.1 # Weight for L1 loss in L1 combination loss
37
+ #resume_from: null # Path to checkpoint to resume training from (set to null to train from scratch)
38
+ resume_from: checkpoints/2025_11_30_batch4/best_model.pt
39
+
40
+ wandb:
41
+ use_wandb: False # Whether to use Weights & Biases for experiment tracking
42
+ project: audio-text-htdemucs # Wandb project name
43
+ run_name: null
44
+ log_every: 50 # Log to wandb every N batches
45
+ validate_every: 1 # Validate every N epochs
46
+ save_every: 5 # Save model checkpoint every N epochs
47
+ checkpoint_dir: checkpoints/2025_12_06/ # Directory to save model checkpoints
48
+ output_dir: results/2025_12_06 # Directory to save inference results
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 
 
1
  demucs==4.0.1
2
  gradio==5.17.1
3
  huggingface_hub
@@ -6,6 +8,9 @@ librosa
6
  loralib
7
  matplotlib==3.10.1
8
  numpy==2.1.3
 
 
 
9
  pathlib
10
  pydantic==2.10.6
11
  soundfile==0.13.1
 
1
+ --extra-index-url https://download.pytorch.org/whl/cpu
2
+
3
  demucs==4.0.1
4
  gradio==5.17.1
5
  huggingface_hub
 
8
  loralib
9
  matplotlib==3.10.1
10
  numpy==2.1.3
11
+ torch==2.6.0
12
+ torchvision==0.21.0
13
+ torchaudio==2.6.0
14
  pathlib
15
  pydantic==2.10.6
16
  soundfile==0.13.1