Update app.py
app.py CHANGED
@@ -1,14 +1,15 @@
 import gradio as gr
 import torchaudio
-from torchaudio.transforms import Resample
 import math
+import torch
+from torchaudio.transforms import Resample
 from typing import Dict, Tuple
 from huggingface_hub import hf_hub_download
-import torch
 from torch import nn, Tensor
 from torch.nn import functional as F
 from tqdm import tqdm
 from safetensors.torch import load_file
+
 def batchify(tensor: Tensor, T: int) -> Tensor:
     orig_size = tensor.size(-1)
     new_size = math.ceil(orig_size / T) * T
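For orientation: the first lines of batchify round the time axis up to the next multiple of T, and presumably the function then zero-pads and splits the spectrogram into fixed-width chunks (the rest of its body is outside this diff). A minimal sketch of that padding step, with pad_to_multiple as a hypothetical stand-in:

import math
import torch
from torch.nn import functional as F

def pad_to_multiple(tensor: torch.Tensor, T: int) -> torch.Tensor:
    # Round the last dimension up to the next multiple of T, then zero-pad,
    # mirroring the arithmetic shown in batchify above.
    orig_size = tensor.size(-1)
    new_size = math.ceil(orig_size / T) * T
    return F.pad(tensor, (0, new_size - orig_size))

x = torch.randn(2, 1024, 700)         # 2 channels x F x L magnitude frames
print(pad_to_multiple(x, 512).shape)  # torch.Size([2, 1024, 1024])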
@@ -59,18 +60,13 @@ class UNet(nn.Module):
     def __init__(
         self,
         n_layers: int = 6,
-        in_channels: int =
+        in_channels: int = 2,
     ) -> None:
         super().__init__()
 
         # DownSample layers
         down_set = [in_channels] + [2 ** (i + 4) for i in range(n_layers)]
-        self.encoder_layers = nn.ModuleList(
-            [
-                EncoderBlock(in_channels=in_ch, out_channels=out_ch)
-                for in_ch, out_ch in zip(down_set[:-1], down_set[1:])
-            ]
-        )
+        self.encoder_layers = nn.ModuleList([EncoderBlock(in_channels=in_ch, out_channels=out_ch) for in_ch, out_ch in zip(down_set[:-1], down_set[1:])])
 
         # UpSample layers
        up_set = [1] + [2 ** (i + 4) for i in range(n_layers)]
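With the defaults above (n_layers=6, in_channels=2), the rewritten one-liner builds the same encoder stack as before: channel widths grow in powers of two from 16 up to 512. A quick check of that progression:

n_layers, in_channels = 6, 2
down_set = [in_channels] + [2 ** (i + 4) for i in range(n_layers)]
print(down_set)  # [2, 16, 32, 64, 128, 256, 512]
print(list(zip(down_set[:-1], down_set[1:])))
# [(2, 16), (16, 32), (32, 64), (64, 128), (128, 256), (256, 512)]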
@@ -122,22 +118,16 @@ class UNet(nn.Module):
 
 class Splitter(nn.Module):
 
-    def __init__(self, stem_num):
+    def __init__(self, instrument_models):
         super(Splitter, self).__init__()
-        if stem_num == 2:
-            stem_names = ["vocals","other"]
-        if stem_num == 4:
-            stem_names = ["vocals", "drums", "bass", "other"]
-        if stem_num == 5:
-            stem_names = ["vocals", "piano", "drums", "bass", "other"]
-        # stft config
         self.F = 1024
         self.T = 512
         self.win_length = 4096
         self.hop_length = 1024
         self.win = nn.Parameter(torch.hann_window(self.win_length), requires_grad=False)
-        self.stems = nn.ModuleDict({name: UNet(
-        self.
+        self.stems = nn.ModuleDict({name: UNet() for name in instrument_models})
+        for name in self.stems.keys():
+            self.stems[name].load_state_dict(load_file(hf_hub_download("shethjenil/spleeter",f"{name}.safetensors")))
         self.eval()
 
     def compute_stft(self, wav: Tensor) -> Tuple[Tensor, Tensor]:
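The rewritten constructor takes explicit checkpoint names instead of a stem count, builds one UNet per name, and loads that stem's weights from the shethjenil/spleeter repo on the Hub. A short usage sketch (the names are a hypothetical selection from the dropdown at the bottom of the file; the first call downloads the .safetensors files):

splitter = Splitter(["5_vocals", "5_drums"])  # one pretrained UNet per stem
print(list(splitter.stems.keys()))            # ['5_vocals', '5_drums']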
@@ -193,7 +183,7 @@ class Splitter(nn.Module):
         return wav.detach()
 
     @torch.inference_mode()
-    def forward(self, wav: Tensor,batch_size=16
+    def forward(self, wav: Tensor,batch_size=16) -> Dict[str, Tensor]:
         # stft - 2 X F x L x 2
         # stft_mag - 2 X F x L
         stft, stft_mag = self.compute_stft(wav.squeeze())
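compute_stft itself is unchanged and outside this diff; judging from the shape comments and the config in __init__, its output is consistent with torch.stft using the 4096-sample Hann window, hop 1024, keeping the first F = 1024 frequency bins. A hedged sketch of those shapes (the bin cropping is an assumption, not shown in the diff):

import torch

win_length, hop_length, F_bins = 4096, 1024, 1024   # values from __init__
wav = torch.randn(2, 44100 * 3)                     # 3 s of stereo at 44.1 kHz
stft = torch.stft(wav, n_fft=win_length, hop_length=hop_length,
                  window=torch.hann_window(win_length), return_complex=True)
print(stft.shape)              # torch.Size([2, 2049, 130]) -> 2 x bins x L
print(stft[:, :F_bins].shape)  # torch.Size([2, 1024, 130]) -> 2 x F x L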
@@ -203,7 +193,7 @@ class Splitter(nn.Module):
         stft_mag = batchify(stft_mag, self.T) # B x 2 x F x T
         stft_mag = stft_mag.transpose(2, 3) # B x 2 x T x F
         # compute stems' mask
-        masks = self.infer_with_batches(stft_mag,batch_size
+        masks = self.infer_with_batches(stft_mag,batch_size)
         # compute denominator
         mask_sum = sum([m**2 for m in masks.values()])
         mask_sum += 1e-10
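The denominator makes the stems' masks compete for the mixture's energy, Wiener-filter style: assuming apply_mask divides each squared mask by this sum (its body is outside the diff), every time-frequency bin is shared across stems and the 1e-10 guards against division by zero. A toy illustration with invented values:

import torch

masks = {"vocals": torch.tensor([0.9, 0.1]), "other": torch.tensor([0.3, 0.8])}
mask_sum = sum(m**2 for m in masks.values()) + 1e-10
ratios = {name: m**2 / mask_sum for name, m in masks.items()}
print(ratios["vocals"])  # tensor([0.9000, 0.0154]) -- dominant where vocals are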
@@ -216,11 +206,11 @@ class Splitter(nn.Module):
             return stft_masked
         return {name: self.inverse_stft(apply_mask(m)) for name, m in masks.items()}
 
-    def infer_with_batches(self, stft_mag, batch_size
+    def infer_with_batches(self, stft_mag, batch_size):
         masks = {name: [] for name in self.stems.keys()}
         for i in tqdm(range(0, stft_mag.shape[0], batch_size)):
             batch = stft_mag[i:i + batch_size]
-            batch_outputs = {name: net(batch) for name, net in self.stems.items()
+            batch_outputs = {name: net(batch) for name, net in self.stems.items()}
             for name in batch_outputs:
                 masks[name].append(batch_outputs[name])
         return {
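infer_with_batches is a plain chunked-inference loop: slice the batch dimension, run every stem network on each slice, collect the outputs per stem, and (per the return shown) presumably concatenate them afterwards. The same pattern in isolation, with run_in_batches as a generic stand-in:

import torch

def run_in_batches(models: dict, data: torch.Tensor, batch_size: int) -> dict:
    # models maps stem name -> callable; outputs are gathered per stem.
    outputs = {name: [] for name in models}
    for i in range(0, data.shape[0], batch_size):
        batch = data[i:i + batch_size]
        for name, net in models.items():
            outputs[name].append(net(batch))
    return {name: torch.cat(chunks) for name, chunks in outputs.items()}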
@@ -229,17 +219,15 @@ class Splitter(nn.Module):
             if masks[name]
         }
 
-def separate_audio(audio_path:str,
-    model = Splitter(instrument_model)
+def separate_audio(audio_path:str,batch_size:int,instrument_models:list,progress=gr.Progress(True)):
     wav, sr = torchaudio.load(audio_path)
     target_sr = 44100
     if sr != target_sr:
         resampler = Resample(sr, target_sr)
         wav = resampler(wav)
         sr = target_sr
-    results =
+    results = Splitter(instrument_models).forward(wav,batch_size)
     for i in results:
         torchaudio.save(f"{i}.mp3", results[i], sr)
-    return
-
-gr.Interface(separate_audio, [gr.Audio(type="filepath"),gr.Dropdown([2,4,5]),gr.Number(16),gr.Dropdown(["vocals", "piano", "drums", "bass", "other"],multiselect=True,value=["vocals", "piano", "drums", "bass", "other"])], [gr.Audio(type="filepath"), gr.Audio(type="filepath"),gr.Audio(type="filepath"),gr.Audio(type="filepath"),gr.Audio(type="filepath")]).launch()
+    return [gr.Audio(i,type='filepath',buttons=['download']) for i in results]
+gr.Interface(separate_audio, [gr.Audio(type="filepath"),gr.Number(16),gr.Dropdown(['2_other', '2_vocals', '4_bass', '4_drums', '4_other', '4_vocals', '5_bass', '5_drums', '5_other', '5_piano', '5_vocals'],multiselect=True,value=['5_vocals'])]).launch()
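End to end, the updated app exposes checkpoint names like 4_vocals (stem-count prefix, instrument suffix) directly in the UI instead of the old 2/4/5 dropdown, and writes one mp3 per selected stem. A hypothetical local call, assuming the functions above are in scope and song.wav exists:

# Separates vocals and drums with the 4-stem checkpoints; this writes
# 4_vocals.mp3 and 4_drums.mp3 next to the script and returns gr.Audio players.
players = separate_audio("song.wav", batch_size=16,
                         instrument_models=["4_vocals", "4_drums"])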
|
|
|
|
|