Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files — This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +11 -0
- app.py +349 -0
- audiosep/__pycache__/utils.cpython-310.pyc +0 -0
- audiosep/__pycache__/utils.cpython-312.pyc +0 -0
- audiosep/config/audiosep_base.yaml +41 -0
- audiosep/models/CLAP/__init__.py +0 -0
- audiosep/models/CLAP/__pycache__/__init__.cpython-310.pyc +0 -0
- audiosep/models/CLAP/__pycache__/__init__.cpython-311.pyc +0 -0
- audiosep/models/CLAP/__pycache__/__init__.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__init__.py +25 -0
- audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/model.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/model.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/model.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-310.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-311.pyc +0 -0
- audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-312.pyc +0 -0
- audiosep/models/CLAP/open_clip/bert.py +40 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
examples/acoustic_guitar.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
examples/laughing.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
examples/ticktok_piano.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
examples/water_drops.wav filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
flowsep/bigvgan/g_01000000 filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
flowsep/latent_diffusion/modules/losses/panns_distance/model/__pycache__/models.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
flowsep/latent_diffusion/modules/losses/panns_distance/model/__pycache__/models.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
temp_result/acoustic_guitar.wav filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
temp_result/laughing.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
temp_result/mixed/acoustic_guitar.wav filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
temp_result/mixed/laughing.wav filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "audiosep"))
|
| 5 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "flowsep"))
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import torch
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torchaudio
|
| 11 |
+
import librosa
|
| 12 |
+
import yaml
|
| 13 |
+
from huggingface_hub import hf_hub_download
|
| 14 |
+
from pytorch_lightning import seed_everything
|
| 15 |
+
|
| 16 |
+
# Automatic CUDA/CPU selection is disabled (kept below for reference);
# every model and tensor in this file is placed on the CPU.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 17 |
+
device = "cpu"
|
| 18 |
+
|
| 19 |
+
# Module-level caches: load_audiosep() / load_flowsep() fill these on their
# first call and return the cached objects afterwards.
_audiosep_model = None
|
| 20 |
+
_flowsep_model = None
|
| 21 |
+
_flowsep_preprocessor = None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class FlowSepPreprocessor:
    """Audio loading and feature-extraction helper for the FlowSep model.

    Sampling rate, clip duration and STFT/mel settings are read from the
    ``"preprocessing"`` section of the model config.
    """

    def __init__(self, config):
        """Cache the audio settings and build the STFT front-end.

        Args:
            config: Mapping whose ``"preprocessing"`` entry contains the
                ``"audio"``, ``"stft"`` and ``"mel"`` sub-sections read below.
        """
        # Project-local package, imported lazily inside the methods that use it
        # (presumably resolved through the sys.path entries added at file top).
        import utilities.audio as Audio

        audio_cfg = config["preprocessing"]["audio"]
        stft_cfg = config["preprocessing"]["stft"]
        mel_cfg = config["preprocessing"]["mel"]

        self.sampling_rate = audio_cfg["sampling_rate"]
        self.duration = audio_cfg["duration"]
        self.hopsize = stft_cfg["hop_length"]
        # Number of STFT frames covering one clip of ``duration`` seconds.
        self.target_length = int(self.duration * self.sampling_rate / self.hopsize)

        self.STFT = Audio.stft.TacotronSTFT(
            stft_cfg["filter_length"],
            stft_cfg["hop_length"],
            stft_cfg["win_length"],
            mel_cfg["n_mel_channels"],
            audio_cfg["sampling_rate"],
            mel_cfg["mel_fmin"],
            mel_cfg["mel_fmax"],
        )

    def read_wav_file(self, filename):
        """Load one clip, normalise it and pad it to the configured duration.

        The file is cut to ``duration`` seconds at its native rate, resampled
        to ``sampling_rate`` if needed, reduced to its first channel,
        normalised with ``preprocess_chunk`` and zero-padded at the end.

        Args:
            filename: Path passed to ``torchaudio.load``.

        Returns:
            ``numpy`` array of shape ``(1, samples)``.
        """
        waveform, sr = torchaudio.load(filename)
        # Slicing past the end is a no-op, so no explicit length check is needed.
        waveform = waveform[:, : int(sr * self.duration)]
        if sr != self.sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sampling_rate)

        # Only the first channel is used; normalisation is shared with the
        # chunked inference path so both code paths stay in agreement.
        waveform = self.preprocess_chunk(waveform.numpy()[0, ...])[None, ...]

        target_samples = int(self.sampling_rate * self.duration)
        if waveform.shape[-1] < target_samples:
            padded = np.zeros((1, target_samples), dtype=np.float32)
            padded[:, : waveform.shape[-1]] = waveform
            waveform = padded
        return waveform

    def wav_feature_extraction(self, waveform):
        """Compute the log-mel spectrogram and STFT of a ``(1, samples)`` waveform.

        Both features are transposed and then padded or cut to
        ``target_length`` rows by ``_pad_spec``.

        Returns:
            Tuple ``(log_mel_spec, stft)`` of float tensors.
        """
        import utilities.audio as Audio

        waveform = torch.FloatTensor(waveform[0, ...])
        log_mel_spec, stft, _energy = Audio.tools.get_mel_from_wav(waveform, self.STFT)
        log_mel_spec = self._pad_spec(torch.FloatTensor(log_mel_spec.T))
        stft = self._pad_spec(torch.FloatTensor(stft.T))
        return log_mel_spec, stft

    def _pad_spec(self, log_mel_spec):
        """Force a 2-D feature tensor to ``target_length`` rows and an even width.

        Rows are zero-padded at the end or truncated; if the last dimension
        is odd, its final column is dropped.
        """
        n_frames = log_mel_spec.shape[0]
        p = self.target_length - n_frames
        if p > 0:
            log_mel_spec = torch.nn.ZeroPad2d((0, 0, 0, p))(log_mel_spec)
        elif p < 0:
            log_mel_spec = log_mel_spec[: self.target_length, :]
        if log_mel_spec.size(-1) % 2 != 0:
            log_mel_spec = log_mel_spec[..., :-1]
        return log_mel_spec

    def load_full_audio(self, filename):
        """Load a whole file at ``sampling_rate`` and return its first channel.

        No trimming, padding or normalisation is applied.

        Returns:
            1-D ``numpy`` array of samples.
        """
        waveform, sr = torchaudio.load(filename)
        if sr != self.sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sampling_rate)
        return waveform.numpy()[0, ...]

    def preprocess_chunk(self, chunk):
        """Remove the DC offset and scale a waveform to a peak amplitude of 0.5.

        Args:
            chunk: Array-like of samples.

        Returns:
            The normalised samples; an empty input is returned unchanged,
            because ``np.max`` cannot reduce a zero-size array.
        """
        if np.size(chunk) == 0:
            return chunk
        chunk = chunk - np.mean(chunk)
        # The epsilon keeps an all-constant (e.g. silent) chunk from dividing by zero.
        chunk = chunk / (np.max(np.abs(chunk)) + 1e-8)
        return chunk * 0.5
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def load_audiosep():
    """Return the AudioSep model, building and caching it on the first call.

    The CLAP checkpoint, model config and separation checkpoint are fetched
    from the ``bianxing77/AudioSep-hive`` Hub repository; the assembled model
    is moved to ``device``, put in eval mode and stored in ``_audiosep_model``.
    """
    global _audiosep_model
    if _audiosep_model is None:
        # Project-local modules, imported only when the model is first needed.
        from models.clap_encoder import CLAP_Encoder
        from utils import parse_yaml, load_ss_model

        repo = "bianxing77/AudioSep-hive"

        clap_path = hf_hub_download(repo_id=repo, filename="music_speech_audioset_epoch_15_esc_89.98.pt")
        text_encoder = CLAP_Encoder(pretrained_path=clap_path).eval()

        config_path = hf_hub_download(repo_id=repo, filename="config.yaml")
        ckpt_path = hf_hub_download(repo_id=repo, filename="audiosep_hive.ckpt")
        separator = load_ss_model(
            configs=parse_yaml(config_path),
            checkpoint_path=ckpt_path,
            query_encoder=text_encoder,
        )
        _audiosep_model = separator.to(device).eval()
    return _audiosep_model
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def load_flowsep():
    """Return ``(model, preprocessor)`` for FlowSep, building them on first call.

    The config and checkpoint are fetched from the ``bianxing77/FlowSep-hive``
    Hub repository. The model is instantiated from the config, loaded with
    the checkpoint's ``"state_dict"`` (strictly), put in eval mode, and cached
    together with a ``FlowSepPreprocessor`` in the module-level globals.
    """
    global _flowsep_model, _flowsep_preprocessor
    if _flowsep_model is not None:
        return _flowsep_model, _flowsep_preprocessor

    seed_everything(0)
    # Project-local module, imported only when the model is first needed.
    from latent_diffusion.util import instantiate_from_config

    config_file = hf_hub_download(repo_id="bianxing77/FlowSep-hive", filename="config.yaml")
    model_file = hf_hub_download(repo_id="bianxing77/FlowSep-hive", filename="flowsep_hive.ckpt")

    # Use a context manager so the config file handle is closed deterministically.
    # NOTE(review): FullLoader is kept to preserve behaviour on a downloaded
    # file; consider yaml.safe_load if the config only contains plain data.
    with open(config_file, "r", encoding="utf-8") as config_stream:
        configs = yaml.load(config_stream, Loader=yaml.FullLoader)
    # The first-stage weights come from the full checkpoint loaded below, so
    # the separate first-stage reload path in the config is cleared.
    configs["model"]["params"]["first_stage_config"]["params"]["reload_from_ckpt"] = None

    preprocessor = FlowSepPreprocessor(configs)

    model = instantiate_from_config(configs["model"]).to(device)
    try:
        ckpt = torch.load(model_file, map_location=device, weights_only=False)["state_dict"]
    except TypeError:
        # Presumably for torch versions whose torch.load lacks ``weights_only``.
        ckpt = torch.load(model_file, map_location=device)["state_dict"]
    model.load_state_dict(ckpt, strict=True)
    model.eval()

    _flowsep_model = model
    _flowsep_preprocessor = preprocessor
    return model, preprocessor
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# Sample rate (Hz) at which AudioSep mixtures are loaded and results returned.
AUDIOSEP_SR = 32000
# Sample rate (Hz) reported for FlowSep batches and results.
# NOTE(review): assumed to equal the sampling rate in the FlowSep config.
FLOWSEP_SR = 16000
# FlowSep is fed fixed windows of 10.24 s (163840 samples) ...
FLOWSEP_CHUNK_IN = FLOWSEP_SR * 1024 // 100
# ... and the first 10 s (160000 samples) of each result are kept.
FLOWSEP_CHUNK_OUT = FLOWSEP_SR * 10
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def separate_audiosep(audio_path, text):
    """Separate the sound described by ``text`` from an audio file with AudioSep.

    Args:
        audio_path: Path of the mixture; loaded as mono at ``AUDIOSEP_SR``.
        text: Natural-language query used as the separation condition.

    Returns:
        Tuple ``(AUDIOSEP_SR, samples)`` where ``samples`` is never longer
        than the loaded mixture.
    """
    model = load_audiosep()
    mixture, _ = librosa.load(audio_path, sr=AUDIOSEP_SR, mono=True)
    input_len = mixture.shape[0]

    with torch.no_grad():
        conditions = model.query_encoder.get_query_embed(
            modality='text', text=[text], device=device
        )
        input_dict = {
            # (batch, channel, samples); torch.tensor copies the data like the
            # legacy torch.Tensor(ndarray) constructor did.
            "mixture": torch.tensor(mixture, dtype=torch.float32)[None, None, :].to(device),
            "condition": conditions,
        }
        if input_len > AUDIOSEP_SR * 10:
            # Inputs longer than 10 s go through the model's chunked inference.
            sep_audio = model.ss_model.chunk_inference(input_dict)
            sep_audio = sep_audio.squeeze()
        else:
            sep_segment = model.ss_model(input_dict)["waveform"]
            sep_audio = sep_segment.squeeze(0).squeeze(0).data.cpu().numpy()

    # Trim on both paths so the result never outlasts the input.
    sep_audio = sep_audio[:input_len]
    return (AUDIOSEP_SR, sep_audio)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def _flowsep_process_chunk(model, preprocessor, chunk_wav, text):
    """Run FlowSep on one waveform window and return the separated samples.

    The window is normalised, zero-padded or cut to ``FLOWSEP_CHUNK_IN``
    samples, converted to a mel spectrogram and passed to
    ``model.generate_sample``. At most ``FLOWSEP_CHUNK_OUT`` samples of the
    result are returned as a ``numpy`` array.
    """
    samples = preprocessor.preprocess_chunk(chunk_wav)
    shortfall = FLOWSEP_CHUNK_IN - len(samples)
    if shortfall > 0:
        samples = np.concatenate([samples, np.zeros(shortfall, dtype=np.float32)])
    samples = samples[:FLOWSEP_CHUNK_IN]

    mixed_mel, _stft = preprocessor.wav_feature_extraction(samples.reshape(1, -1))

    # Only the text and the mixed_* entries carry real data; the remaining
    # tensors are random placeholders (presumably required by the batch schema).
    batch = {
        "fname": ["temp"],
        "text": [text],
        "caption": [text],
        "waveform": torch.rand(1, 1, FLOWSEP_CHUNK_IN).to(device),
        "log_mel_spec": torch.rand(1, 1024, 64).to(device),
        "sampling_rate": torch.tensor([FLOWSEP_SR]).to(device),
        "label_vector": torch.rand(1, 527).to(device),
        "stft": torch.rand(1, 1024, 512).to(device),
        "mixed_waveform": torch.from_numpy(samples.reshape(1, 1, FLOWSEP_CHUNK_IN)).to(device),
        "mixed_mel": mixed_mel.reshape(1, mixed_mel.shape[0], mixed_mel.shape[1]).to(device),
    }
    result = model.generate_sample(
        [batch],
        name="temp_result",
        unconditional_guidance_scale=1.0,
        ddim_steps=20,
        n_gen=1,
        save=False,
        save_mixed=False,
    )

    separated = result.squeeze()
    if not isinstance(result, np.ndarray):
        # Tensor result: move to host memory and convert.
        separated = separated.cpu().numpy()
    return separated[:FLOWSEP_CHUNK_OUT]
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def separate_flowsep(audio_path, text):
    """Separate the sound described by ``text`` from an audio file with FlowSep.

    The audio is processed in windows of ``FLOWSEP_CHUNK_IN`` samples that
    advance by ``FLOWSEP_CHUNK_OUT`` samples; each window contributes at most
    ``FLOWSEP_CHUNK_OUT`` output samples. Inputs of every length use the same
    loop, so an input slightly longer than ``FLOWSEP_CHUNK_OUT`` has its tail
    separated in a second window rather than replaced by zero padding.

    Args:
        audio_path: Path of the mixture.
        text: Natural-language query used as the separation condition.

    Returns:
        Tuple ``(FLOWSEP_SR, samples)`` with ``samples`` exactly as long as
        the loaded input.

    Raises:
        ValueError: If the loaded audio contains no samples.
    """
    model, preprocessor = load_flowsep()
    full_wav = preprocessor.load_full_audio(audio_path)
    input_len = full_wav.shape[0]
    if input_len == 0:
        # Fail with a clear message instead of a numpy reduction error.
        raise ValueError("Input audio is empty")

    out_list = []
    with torch.no_grad():
        for start in range(0, input_len, FLOWSEP_CHUNK_OUT):
            chunk = full_wav[start:start + FLOWSEP_CHUNK_IN]
            out_chunk = _flowsep_process_chunk(model, preprocessor, chunk.copy(), text)
            # The last window may cover fewer than FLOWSEP_CHUNK_OUT real samples.
            need = min(FLOWSEP_CHUNK_OUT, input_len - start)
            out_list.append(out_chunk[:need])
    sep_audio = np.concatenate(out_list)

    # Guarantee the output length matches the input exactly.
    if len(sep_audio) > input_len:
        sep_audio = sep_audio[:input_len]
    elif len(sep_audio) < input_len:
        sep_audio = np.pad(sep_audio, (0, input_len - len(sep_audio)), mode="constant", constant_values=0)

    return (FLOWSEP_SR, sep_audio)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def inference(audio, text, model_choice):
    """Validate the UI inputs and dispatch to the selected separation model.

    Args:
        audio: Path of the uploaded audio, or ``None`` when nothing was uploaded.
        text: Text query describing the sound to extract.
        model_choice: ``"AudioSep-hive"`` selects AudioSep; any other value
            selects FlowSep.

    Returns:
        The ``(sample_rate, samples)`` tuple produced by the chosen model.

    Raises:
        gr.Error: If the audio is missing or the query is blank.
    """
    if audio is None:
        raise gr.Error("Please upload an audio file / 请上传音频文件")
    if not (text and text.strip()):
        raise gr.Error("Please enter a text query / 请输入文本描述")

    separate = separate_audiosep if model_choice == "AudioSep-hive" else separate_flowsep
    return separate(audio, text)
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
DESCRIPTION = """
|
| 255 |
+
# Universal Sound Separation on HIVE
|
| 256 |
+
|
| 257 |
+
**Hive** is a high-quality synthetic dataset (2k hours) built via an automated pipeline that mines high-purity single-event segments and synthesizes semantically consistent mixtures. Despite using only ~0.2% of the data scale of million-hour baselines, models trained on Hive achieve competitive separation accuracy and strong zero-shot generalization.
|
| 258 |
+
|
| 259 |
+
This space provides two separation models trained on Hive:
|
| 260 |
+
- **AudioSep**: A foundation model for open-domain sound separation with natural language queries, based on [AudioSep](https://github.com/Audio-AGI/AudioSep).
|
| 261 |
+
- **FlowSep**: A flow-matching based separation model with text conditioning, based on [FlowSep](https://github.com/Audio-AGI/FlowSep).
|
| 262 |
+
|
| 263 |
+
**How to use:**
|
| 264 |
+
1. Upload an audio file (mix of sounds)
|
| 265 |
+
2. Describe what you want to separate (e.g., "piano", "speech", "dog barking")
|
| 266 |
+
3. Select a model and click Separate
|
| 267 |
+
|
| 268 |
+
[[Paper]](https://arxiv.org/abs/2601.22599) | [[Code]](https://github.com/ShandaAI/Hive) | [[Hive Dataset]](https://huggingface.co/datasets/ShandaAI/Hive) | [[Demo Page]](https://shandaai.github.io/Hive/)
|
| 269 |
+
"""
|
| 270 |
+
|
| 271 |
+
EXAMPLES = [
|
| 272 |
+
["examples/acoustic_guitar.wav", "acoustic guitar"],
|
| 273 |
+
["examples/laughing.wav", "laughing"],
|
| 274 |
+
["examples/ticktok_piano.wav", "A ticktock sound playing at the same rhythm with piano"],
|
| 275 |
+
["examples/water_drops.wav", "water drops"],
|
| 276 |
+
["examples/noisy_speech.wav", "speech"],
|
| 277 |
+
]
|
| 278 |
+
|
| 279 |
+
# Gradio page: inputs and the submit button on the left, the result on the right.
with gr.Blocks(theme=gr.themes.Soft(), title="Universal Sound Separation on HIVE") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column():
            # type="filepath": the handler is given a path it can load itself.
            audio_input = gr.Audio(label="Input Mixture Audio", type="filepath")
            text_input = gr.Textbox(label="Text Query", placeholder='e.g. "dog barking", "piano playing"')
            model_choice = gr.Dropdown(
                choices=["AudioSep-hive", "FlowSep-hive"],
                value="AudioSep-hive",
                label="Select Model",
            )
            submit_btn = gr.Button("Separate", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Separated Audio")

    # inference() returns (sample_rate, samples), which the output Audio plays.
    submit_btn.click(fn=inference, inputs=[audio_input, text_input, model_choice], outputs=audio_output)

    gr.Markdown("## Examples")
    # Each example fills the audio and text inputs; the model choice is left as set.
    gr.Examples(examples=EXAMPLES, inputs=[audio_input, text_input])
|
| 310 |
+
|
| 311 |
+
# Set to True to smoke-test both models before the UI is launched.
DEBUG = False


def run_debug():
    """Run both separation models once on the bundled guitar example.

    Progress is printed with ``[DEBUG]`` prefixes. If the example file is
    missing, a skip message is printed and nothing is run.
    """
    heavy_rule = "=" * 50
    light_rule = "-" * 40
    test_path = os.path.join(os.path.dirname(__file__), "examples", "acoustic_guitar.wav")
    test_text = "acoustic guitar"

    print("\n" + heavy_rule, "[DEBUG] Starting inference test for both models", heavy_rule, sep="\n")

    if not os.path.exists(test_path):
        print(f"[DEBUG] Skip: {test_path} not found")
        return

    print(f"\n[DEBUG] Using test audio: {test_path}")

    print("\n" + light_rule, "[DEBUG] AudioSep inference", light_rule, "[DEBUG] Loading AudioSep model...", sep="\n")
    audiosep_sr, audiosep_wav = separate_audiosep(test_path, test_text)
    print(f"[DEBUG] AudioSep done. Output sr={audiosep_sr}, shape={np.array(audiosep_wav).shape}")

    print("\n" + light_rule, "[DEBUG] FlowSep inference", light_rule, "[DEBUG] Loading FlowSep model...", sep="\n")
    flowsep_sr, flowsep_wav = separate_flowsep(test_path, test_text)
    print(f"[DEBUG] FlowSep done. Output sr={flowsep_sr}, shape={np.array(flowsep_wav).shape}")

    print("\n" + heavy_rule, "[DEBUG] Both models passed inference test", heavy_rule + "\n", sep="\n")


if DEBUG:
    run_debug()

demo.launch()
|
audiosep/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (9.65 kB). View file
|
|
|
audiosep/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (15.4 kB). View file
|
|
|
audiosep/config/audiosep_base.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
task_name: AudioSep
|
| 3 |
+
|
| 4 |
+
data:
|
| 5 |
+
datafiles:
|
| 6 |
+
- 'datafiles/template.json'
|
| 7 |
+
|
| 8 |
+
sampling_rate: 32000
|
| 9 |
+
segment_seconds: 5
|
| 10 |
+
loudness_norm:
|
| 11 |
+
lower_db: -10
|
| 12 |
+
higher_db: 10
|
| 13 |
+
max_mix_num: 2
|
| 14 |
+
|
| 15 |
+
model:
|
| 16 |
+
query_net: CLAP
|
| 17 |
+
condition_size: 512
|
| 18 |
+
model_type: ResUNet30
|
| 19 |
+
input_channels: 1
|
| 20 |
+
output_channels: 1
|
| 21 |
+
resume_checkpoint: ""
|
| 22 |
+
use_text_ratio: 1.0
|
| 23 |
+
|
| 24 |
+
train:
|
| 25 |
+
optimizer:
|
| 26 |
+
optimizer_type: AdamW
|
| 27 |
+
learning_rate: 1e-3
|
| 28 |
+
warm_up_steps: 10000
|
| 29 |
+
reduce_lr_steps: 1000000
|
| 30 |
+
lr_lambda_type: constant_warm_up
|
| 31 |
+
num_nodes: 1
|
| 32 |
+
num_workers: 6
|
| 33 |
+
loss_type: l1_wav
|
| 34 |
+
sync_batchnorm: True
|
| 35 |
+
batch_size_per_device: 12
|
| 36 |
+
steps_per_epoch: 10000 # Every 10000 steps is called an `epoch`.
|
| 37 |
+
evaluate_step_frequency: 10000 # Evaluate every #evaluate_step_frequency steps.
|
| 38 |
+
save_step_frequency: 20000 # Save every #save_step_frequency steps.
|
| 39 |
+
early_stop_steps: 10000001
|
| 40 |
+
random_seed: 1234
|
| 41 |
+
|
audiosep/models/CLAP/__init__.py
ADDED
|
File without changes
|
audiosep/models/CLAP/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (199 Bytes). View file
|
|
|
audiosep/models/CLAP/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (232 Bytes). View file
|
|
|
audiosep/models/CLAP/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (209 Bytes). View file
|
|
|
audiosep/models/CLAP/open_clip/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .factory import (
|
| 2 |
+
list_models,
|
| 3 |
+
create_model,
|
| 4 |
+
create_model_and_transforms,
|
| 5 |
+
add_model_config,
|
| 6 |
+
)
|
| 7 |
+
from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
|
| 8 |
+
from .model import (
|
| 9 |
+
CLAP,
|
| 10 |
+
CLAPTextCfg,
|
| 11 |
+
CLAPVisionCfg,
|
| 12 |
+
CLAPAudioCfp,
|
| 13 |
+
convert_weights_to_fp16,
|
| 14 |
+
trace_model,
|
| 15 |
+
)
|
| 16 |
+
from .openai import load_openai_model, list_openai_models
|
| 17 |
+
from .pretrained import (
|
| 18 |
+
list_pretrained,
|
| 19 |
+
list_pretrained_tag_models,
|
| 20 |
+
list_pretrained_model_tags,
|
| 21 |
+
get_pretrained_url,
|
| 22 |
+
download_pretrained,
|
| 23 |
+
)
|
| 24 |
+
from .tokenizer import SimpleTokenizer, tokenize
|
| 25 |
+
from .transform import image_transform
|
audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.35 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (1.06 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-310.pyc
ADDED
|
Binary file (6.71 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-311.pyc
ADDED
|
Binary file (13.5 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-312.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-310.pyc
ADDED
|
Binary file (4.16 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-311.pyc
ADDED
|
Binary file (9.94 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-312.pyc
ADDED
|
Binary file (9.12 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-310.pyc
ADDED
|
Binary file (30.8 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-311.pyc
ADDED
|
Binary file (57.8 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-312.pyc
ADDED
|
Binary file (54.1 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-310.pyc
ADDED
|
Binary file (8.01 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-311.pyc
ADDED
|
Binary file (17.8 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-312.pyc
ADDED
|
Binary file (16.1 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (24.2 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/model.cpython-311.pyc
ADDED
|
Binary file (48.2 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/model.cpython-312.pyc
ADDED
|
Binary file (45.4 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-310.pyc
ADDED
|
Binary file (4.56 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-311.pyc
ADDED
|
Binary file (8.46 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-312.pyc
ADDED
|
Binary file (7.38 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-310.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-311.pyc
ADDED
|
Binary file (30 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-312.pyc
ADDED
|
Binary file (27.2 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-310.pyc
ADDED
|
Binary file (5.08 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-311.pyc
ADDED
|
Binary file (8.33 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-312.pyc
ADDED
|
Binary file (7.14 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-310.pyc
ADDED
|
Binary file (3.48 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-311.pyc
ADDED
|
Binary file (5.82 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-312.pyc
ADDED
|
Binary file (5.05 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-310.pyc
ADDED
|
Binary file (7.4 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-311.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-312.pyc
ADDED
|
Binary file (11.1 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-310.pyc
ADDED
|
Binary file (1.02 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-311.pyc
ADDED
|
Binary file (1.6 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-312.pyc
ADDED
|
Binary file (1.36 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (19.9 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (16.8 kB). View file
|
|
|
audiosep/models/CLAP/open_clip/bert.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Text-embedding helpers backed by pretrained BERT, RoBERTa and BART encoders.

All three tokenizer/model pairs are loaded when this module is imported.
"""
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers import BartTokenizer, BartModel

# Each helper needs its own tokenizer/model pair: if all three were bound to
# the same module-level names, every helper would end up using whichever
# checkpoint was loaded last.
_bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
_bert_model = BertModel.from_pretrained("bert-base-uncased")


def bert_embeddings(text):
    """Tokenize ``text`` and return the output of ``bert-base-uncased``."""
    encoded_input = _bert_tokenizer(text, return_tensors="pt")
    return _bert_model(**encoded_input)


_roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
_roberta_model = RobertaModel.from_pretrained("roberta-base")


def Roberta_embeddings(text):
    """Tokenize ``text`` and return the output of ``roberta-base``."""
    encoded_input = _roberta_tokenizer(text, return_tensors="pt")
    return _roberta_model(**encoded_input)


_bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
_bart_model = BartModel.from_pretrained("facebook/bart-base")


def bart_embeddings(text):
    """Tokenize ``text`` and return the output of ``facebook/bart-base``."""
    encoded_input = _bart_tokenizer(text, return_tensors="pt")
    return _bart_model(**encoded_input)


# Public module attributes kept for backward compatibility: they hold the
# BART pair, the same objects earlier importers of this module observed.
tokenizer = _bart_tokenizer
model = _bart_model
text = "Replace me by any text you'd like."
|