music-separator-space

Sleeping

App Files Files Community

theadityamittal committed on Jun 18, 2025

Commit

acf8aa4

1 Parent(s): 3b79f8c

Initial Space with model download + Gradio demo

Browse files

Files changed (8) hide show

.DS_Store +0 -0
config/default.yaml +50 -0
requirements.txt +11 -0
serve.py +108 -0
src/.DS_Store +0 -0
src/__init__.py +0 -0
src/models/__init__.py +0 -0
src/models/unet.py +92 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

config/default.yaml ADDED Viewed

	@@ -0,0 +1,50 @@

+# config/default.yaml
+device: "mps"
+data:
+  raw_path: data/raw
+  splits: ["train", "test"]
+  processed_path: data/processed
+  sample_rate: 16000
+  n_fft: 1024
+  hop_length: 512
+  n_mels: 80
+  segment_length: 256
+  # for DataLoader
+  batch_size: 16
+  num_workers: 4
+  # list of all sources; "mixture" must stay first — serve.py sizes the model as len(sources) - 1 and labels its outputs with sources[1:]
+  sources: ["mixture", "drums", "bass", "other", "vocals"]
+model:
+  checkpoint_dir: models/checkpoints
+  # for UNet
+  chans: 32
+  num_pool_layers: 4
+training:
+  # for training loop
+  epochs: 50
+  lr: 1e-4
+  max_steps: null
+  log_interval: 50    # how many batches between progress logs
+augment:
+  # defaults for your SpectrogramTransforms
+  time_mask_param: 30
+  freq_mask_param: 15
+  time_warp_param: 40
+  stripe_time_width: 1
+  stripe_freq_width: 1
+  stripe_time_count: 2
+  stripe_freq_count: 2
+  noise_std: 0.01
+experiment:
+  # MLflow experiment metadata
+  name: default_experiment
+  run_name: run1

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+torch
+librosa
+fastapi
+uvicorn
+mlflow
+dvc
+pytest
+gradio
+soundfile
+huggingface_hub
+omegaconf

serve.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# serve.py
+import os
+import tempfile
+import numpy as np
+import torch
+import librosa
+import soundfile as sf
+import gradio as gr
+from omegaconf import OmegaConf
+from huggingface_hub import hf_hub_download
+from src.models.unet import UNet
+# 1) Load your config and model once at startup
+CFG = OmegaConf.load("config/default.yaml")
+DEVICE = torch.device("mps" if torch.mps else "cpu")
+MODEL = UNet(
+    in_ch=1,
+    num_sources=len(CFG.data.sources) - 1,
+    chans=CFG.model.chans,
+    num_pool_layers=CFG.model.num_pool_layers
+).to(DEVICE)
+# point this at your best checkpoint in the Space
+ckpt_file = hf_hub_download(
+    repo_id="theadityamittal/music-separator-unet",
+    filename="checkpoints/unet_best.pt"
+)
+MODEL.load_state_dict(torch.load(ckpt_file, map_location=DEVICE))
+MODEL.eval()
+def separate_file(mix_path):
+    """
+    Given a file path to the uploaded mixture WAV, returns
+    a dict of { "drums": path, "bass": path, ... } to the separated .wav files.
+    """
+    # 1. Load audio & STFT
+    wav, sr = librosa.load(mix_path, sr=CFG.data.sample_rate, mono=True)
+    stft = librosa.stft(
+        wav, n_fft=CFG.data.n_fft, hop_length=CFG.data.hop_length
+    )
+    mag, phase = np.abs(stft), np.angle(stft)
+    F, T = mag.shape
+    # 2. Pad to multiple of segment_length
+    SEG = CFG.data.segment_length
+    pad = (SEG - (T % SEG)) % SEG
+    if pad:
+        mag   = np.pad(mag,   ((0,0),(0,pad)), constant_values=0)
+        phase = np.pad(phase, ((0,0),(0,pad)), constant_values=0)
+    n_seg = mag.shape[1] // SEG
+    # 3. Inference in chunks
+    preds = []
+    with torch.no_grad():
+        for i in range(n_seg):
+            mseg = mag[:, i*SEG:(i+1)*SEG]
+            x = torch.from_numpy(mseg).unsqueeze(0).unsqueeze(0).to(DEVICE).float()
+            y = MODEL(x)  # (1, S, F, SEG)
+            preds.append(y.squeeze(0).cpu().numpy())
+    pred_mag = np.concatenate(preds, axis=2)[:, :, :T]
+    phase    = phase[:, :T]
+    # 4. Reconstruct waveforms and write to temp files
+    out_paths = {}
+    for idx, src in enumerate(CFG.data.sources[1:]):
+        spec = pred_mag[idx] * np.exp(1j * phase)
+        est  = librosa.istft(
+            spec,
+            hop_length=CFG.data.hop_length,
+            win_length=CFG.data.n_fft
+        )
+        # write to a temp WAV file
+        fd, path = tempfile.mkstemp(suffix=f"_{src}.wav")
+        os.close(fd)
+        sf.write(path, est, sr)
+        out_paths[src] = path
+    # return in the order drums, bass, other, vocals
+    return [out_paths[src] for src in CFG.data.sources[1:]]
+# 5) Build Gradio interface
+description = """
+## Music Source Separation
+Upload a mix `.wav` and get back **drums**, **bass**, **other**, and **vocals** stems separated by a U-Net model.
+"""
+iface = gr.Interface(
+    fn=separate_file,
+    inputs=gr.Audio(label="Mixture (.wav)", type="filepath"),
+    outputs=[
+         gr.Audio(label="Drums",  type="filepath"),
+         gr.Audio(label="Bass",   type="filepath"),
+         gr.Audio(label="Other",  type="filepath"),
+         gr.Audio(label="Vocals", type="filepath"),
+     ],
+    title="U-Net Music Separator",
+    description=description,
+    allow_flagging="never",
+)
+if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=True)

src/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

src/__init__.py ADDED Viewed

File without changes

src/models/__init__.py ADDED Viewed

File without changes

src/models/unet.py ADDED Viewed

	@@ -0,0 +1,92 @@

+# src/models/unet.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class ConvBlock(nn.Module):
+    def __init__(self, in_ch: int, out_ch: int):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
+            nn.BatchNorm2d(out_ch),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1),
+            nn.BatchNorm2d(out_ch),
+            nn.ReLU(inplace=True),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net(x)
+class DownBlock(nn.Module):
+    def __init__(self, in_ch: int, out_ch: int):
+        super().__init__()
+        self.conv = ConvBlock(in_ch, out_ch)
+        self.pool = nn.MaxPool2d(kernel_size=2)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x)
+        return self.pool(x)
+class UpBlock(nn.Module):
+    def __init__(self, in_ch: int, out_ch: int):
+        super().__init__()
+        self.upconv = nn.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2)
+        self.conv   = ConvBlock(in_ch, out_ch)
+    def forward(self, x: torch.Tensor, skip: torch.Tensor) -> torch.Tensor:
+        x = self.upconv(x)
+        # pad if needed
+        if x.shape != skip.shape:
+            diffY = skip.size(2) - x.size(2)
+            diffX = skip.size(3) - x.size(3)
+            x = F.pad(x, [diffX//2, diffX-diffX//2, diffY//2, diffY-diffY//2])
+        x = torch.cat([skip, x], dim=1)
+        return self.conv(x)
+class UNet(nn.Module):
+    def __init__(self,
+                 in_ch: int          = 1,
+                 num_sources: int    = 4,
+                 chans: int          = 32,
+                 num_pool_layers: int= 4):
+        super().__init__()
+        # --- Encoder ---
+        self.down_blocks = nn.ModuleList()
+        ch = chans
+        # first layer
+        self.down_blocks.append(ConvBlock(in_ch, ch))
+        # further downsampling
+        for _ in range(1, num_pool_layers):
+            self.down_blocks.append(DownBlock(ch, ch*2))
+            ch *= 2
+        # --- Bottleneck ---
+        self.bottleneck = ConvBlock(ch, ch*2)
+        ch *= 2  # now channel count matches bottleneck output
+        # --- Decoder ---
+        self.up_blocks = nn.ModuleList()
+        for _ in range(num_pool_layers):
+            # in_ch = two times the skip channels
+            self.up_blocks.append(UpBlock(ch, ch//2))
+            ch //= 2
+        # ch now equals the number of channels output by the last UpBlock
+        # --- Final conv ---
+        self.final_conv = nn.Conv2d(ch, num_sources, kernel_size=1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        skips = []
+        for down in self.down_blocks:
+            x = down(x)
+            skips.append(x)
+        x = self.bottleneck(x)
+        for up, skip in zip(self.up_blocks, reversed(skips)):
+            x = up(x, skip)
+        return self.final_conv(x)