bianxing77 commited on
Commit
1dad4d3
·
verified ·
1 Parent(s): 1d887a7

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +11 -0
  2. app.py +349 -0
  3. audiosep/__pycache__/utils.cpython-310.pyc +0 -0
  4. audiosep/__pycache__/utils.cpython-312.pyc +0 -0
  5. audiosep/config/audiosep_base.yaml +41 -0
  6. audiosep/models/CLAP/__init__.py +0 -0
  7. audiosep/models/CLAP/__pycache__/__init__.cpython-310.pyc +0 -0
  8. audiosep/models/CLAP/__pycache__/__init__.cpython-311.pyc +0 -0
  9. audiosep/models/CLAP/__pycache__/__init__.cpython-312.pyc +0 -0
  10. audiosep/models/CLAP/open_clip/__init__.py +25 -0
  11. audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-310.pyc +0 -0
  12. audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-311.pyc +0 -0
  13. audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-312.pyc +0 -0
  14. audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-310.pyc +0 -0
  15. audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-311.pyc +0 -0
  16. audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-312.pyc +0 -0
  17. audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-310.pyc +0 -0
  18. audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-311.pyc +0 -0
  19. audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-312.pyc +0 -0
  20. audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-310.pyc +0 -0
  21. audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-311.pyc +0 -0
  22. audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-312.pyc +0 -0
  23. audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-310.pyc +0 -0
  24. audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-311.pyc +0 -0
  25. audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-312.pyc +0 -0
  26. audiosep/models/CLAP/open_clip/__pycache__/model.cpython-310.pyc +0 -0
  27. audiosep/models/CLAP/open_clip/__pycache__/model.cpython-311.pyc +0 -0
  28. audiosep/models/CLAP/open_clip/__pycache__/model.cpython-312.pyc +0 -0
  29. audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-310.pyc +0 -0
  30. audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-311.pyc +0 -0
  31. audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-312.pyc +0 -0
  32. audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-310.pyc +0 -0
  33. audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-311.pyc +0 -0
  34. audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-312.pyc +0 -0
  35. audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-310.pyc +0 -0
  36. audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-311.pyc +0 -0
  37. audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-312.pyc +0 -0
  38. audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-310.pyc +0 -0
  39. audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-311.pyc +0 -0
  40. audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-312.pyc +0 -0
  41. audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-310.pyc +0 -0
  42. audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-311.pyc +0 -0
  43. audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-312.pyc +0 -0
  44. audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-310.pyc +0 -0
  45. audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-311.pyc +0 -0
  46. audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-312.pyc +0 -0
  47. audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-310.pyc +0 -0
  48. audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-311.pyc +0 -0
  49. audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-312.pyc +0 -0
  50. audiosep/models/CLAP/open_clip/bert.py +40 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/acoustic_guitar.wav filter=lfs diff=lfs merge=lfs -text
37
+ examples/laughing.wav filter=lfs diff=lfs merge=lfs -text
38
+ examples/ticktok_piano.wav filter=lfs diff=lfs merge=lfs -text
39
+ examples/water_drops.wav filter=lfs diff=lfs merge=lfs -text
40
+ flowsep/bigvgan/g_01000000 filter=lfs diff=lfs merge=lfs -text
41
+ flowsep/latent_diffusion/modules/losses/panns_distance/model/__pycache__/models.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
42
+ flowsep/latent_diffusion/modules/losses/panns_distance/model/__pycache__/models.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
43
+ temp_result/acoustic_guitar.wav filter=lfs diff=lfs merge=lfs -text
44
+ temp_result/laughing.wav filter=lfs diff=lfs merge=lfs -text
45
+ temp_result/mixed/acoustic_guitar.wav filter=lfs diff=lfs merge=lfs -text
46
+ temp_result/mixed/laughing.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "audiosep"))
5
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "flowsep"))
6
+
7
+ import gradio as gr
8
+ import torch
9
+ import numpy as np
10
+ import torchaudio
11
+ import librosa
12
+ import yaml
13
+ from huggingface_hub import hf_hub_download
14
+ from pytorch_lightning import seed_everything
15
+
16
# CUDA autodetect is kept commented out; this app hard-codes CPU inference.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

# Lazily-initialized singletons, populated on first use by
# load_audiosep() / load_flowsep() so the heavyweight checkpoints are only
# downloaded when a model is actually requested.
_audiosep_model = None
_flowsep_model = None
_flowsep_preprocessor = None
22
+
23
+
24
class FlowSepPreprocessor:
    """Waveform loading and mel-spectrogram feature extraction for FlowSep.

    Wraps a ``TacotronSTFT`` configured from the FlowSep YAML config and
    provides helpers to load, normalize, pad, and featurize audio.
    """

    def __init__(self, config):
        # Local import: `utilities` lives inside the flowsep package that
        # app.py adds to sys.path at startup.
        import utilities.audio as Audio

        self.sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
        self.duration = config["preprocessing"]["audio"]["duration"]
        self.hopsize = config["preprocessing"]["stft"]["hop_length"]
        # Number of STFT frames that cover `duration` seconds of audio.
        self.target_length = int(self.duration * self.sampling_rate / self.hopsize)

        self.STFT = Audio.stft.TacotronSTFT(
            config["preprocessing"]["stft"]["filter_length"],
            config["preprocessing"]["stft"]["hop_length"],
            config["preprocessing"]["stft"]["win_length"],
            config["preprocessing"]["mel"]["n_mel_channels"],
            config["preprocessing"]["audio"]["sampling_rate"],
            config["preprocessing"]["mel"]["mel_fmin"],
            config["preprocessing"]["mel"]["mel_fmax"],
        )

    def read_wav_file(self, filename):
        """Load `filename`, truncate to `duration` seconds, resample,
        normalize, and zero-pad; returns a (1, n_samples) numpy array."""
        waveform, sr = torchaudio.load(filename)
        target_length = int(sr * self.duration)
        if waveform.shape[-1] > target_length:
            waveform = waveform[:, :target_length]
        if sr != self.sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sampling_rate)
        # Keep only the first channel.
        waveform = waveform.numpy()[0, ...]
        # Remove DC offset, peak-normalize (epsilon guards all-zero input),
        # then scale down to 0.5 peak amplitude.
        waveform = waveform - np.mean(waveform)
        waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
        waveform = waveform * 0.5
        waveform = waveform[None, ...]
        target_samples = int(self.sampling_rate * self.duration)
        if waveform.shape[-1] < target_samples:
            temp_wav = np.zeros((1, target_samples), dtype=np.float32)
            temp_wav[:, :waveform.shape[-1]] = waveform
            waveform = temp_wav
        return waveform

    def wav_feature_extraction(self, waveform):
        """Return ``(log_mel_spec, stft)`` tensors for a (1, n) waveform,
        both padded/cropped to `target_length` frames."""
        import utilities.audio as Audio

        waveform = waveform[0, ...]
        waveform = torch.FloatTensor(waveform)
        log_mel_spec, stft, energy = Audio.tools.get_mel_from_wav(waveform, self.STFT)
        # Transpose to (time, freq) before padding along the time axis.
        log_mel_spec = torch.FloatTensor(log_mel_spec.T)
        stft = torch.FloatTensor(stft.T)
        log_mel_spec = self._pad_spec(log_mel_spec)
        stft = self._pad_spec(stft)
        return log_mel_spec, stft

    def _pad_spec(self, log_mel_spec):
        """Zero-pad or crop the time axis to `target_length` frames, and drop
        the last frequency bin if the bin count is odd."""
        n_frames = log_mel_spec.shape[0]
        p = self.target_length - n_frames
        if p > 0:
            m = torch.nn.ZeroPad2d((0, 0, 0, p))
            log_mel_spec = m(log_mel_spec)
        elif p < 0:
            log_mel_spec = log_mel_spec[:self.target_length, :]
        # Force an even feature dimension (presumably required by the
        # downstream network architecture — TODO confirm).
        if log_mel_spec.size(-1) % 2 != 0:
            log_mel_spec = log_mel_spec[..., :-1]
        return log_mel_spec

    def load_full_audio(self, filename):
        """Load the whole file as mono at `sampling_rate` (no normalization,
        no padding); returns a 1-D numpy array."""
        waveform, sr = torchaudio.load(filename)
        if sr != self.sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sampling_rate)
        waveform = waveform.numpy()[0, ...]
        return waveform

    def preprocess_chunk(self, chunk):
        """DC-remove and peak-normalize one chunk to 0.5 peak amplitude."""
        chunk = chunk - np.mean(chunk)
        chunk = chunk / (np.max(np.abs(chunk)) + 1e-8)
        chunk = chunk * 0.5
        return chunk
98
+
99
+
100
def load_audiosep():
    """Download, build, and cache the AudioSep-hive model (lazy singleton).

    Returns the cached model on subsequent calls without re-downloading.
    """
    global _audiosep_model
    if _audiosep_model is None:
        from models.clap_encoder import CLAP_Encoder
        from utils import parse_yaml, load_ss_model

        # CLAP text/audio query encoder checkpoint.
        clap_ckpt = hf_hub_download(
            repo_id="bianxing77/AudioSep-hive",
            filename="music_speech_audioset_epoch_15_esc_89.98.pt",
        )
        query_encoder = CLAP_Encoder(pretrained_path=clap_ckpt).eval()

        # Separation model config + weights.
        config_file = hf_hub_download(repo_id="bianxing77/AudioSep-hive", filename="config.yaml")
        checkpoint_file = hf_hub_download(repo_id="bianxing77/AudioSep-hive", filename="audiosep_hive.ckpt")
        separation_model = load_ss_model(
            configs=parse_yaml(config_file),
            checkpoint_path=checkpoint_file,
            query_encoder=query_encoder,
        )
        _audiosep_model = separation_model.to(device).eval()
    return _audiosep_model
118
+
119
+
120
def load_flowsep():
    """Download, build, and cache the FlowSep-hive model and its preprocessor.

    Returns:
        (model, preprocessor): the cached pair on subsequent calls.
    """
    global _flowsep_model, _flowsep_preprocessor
    if _flowsep_model is not None:
        return _flowsep_model, _flowsep_preprocessor

    seed_everything(0)
    from latent_diffusion.util import instantiate_from_config

    config_file = hf_hub_download(repo_id="bianxing77/FlowSep-hive", filename="config.yaml")
    model_file = hf_hub_download(repo_id="bianxing77/FlowSep-hive", filename="flowsep_hive.ckpt")

    # FIX: the original used a bare `open()` inside yaml.load and never closed
    # the file handle; a context manager guarantees closure.
    with open(config_file, "r") as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    # Disable the separate first-stage checkpoint reload; the weights are
    # loaded from the main checkpoint below.
    configs["model"]["params"]["first_stage_config"]["params"]["reload_from_ckpt"] = None

    preprocessor = FlowSepPreprocessor(configs)

    model = instantiate_from_config(configs["model"]).to(device)
    try:
        # torch >= 2.6 defaults weights_only=True; pass False explicitly.
        ckpt = torch.load(model_file, map_location=device, weights_only=False)["state_dict"]
    except TypeError:
        # Older torch versions do not accept the weights_only keyword.
        ckpt = torch.load(model_file, map_location=device)["state_dict"]
    model.load_state_dict(ckpt, strict=True)
    model.eval()

    _flowsep_model = model
    _flowsep_preprocessor = preprocessor
    return model, preprocessor
147
+
148
+
149
# AudioSep operates on 32 kHz mono audio.
AUDIOSEP_SR = 32000
# FlowSep consumes fixed chunks of 163840 input samples (10.24 s @ 16 kHz)...
FLOWSEP_CHUNK_IN = 163840
# ...and 160000 output samples (10 s) are kept from each chunk.
FLOWSEP_CHUNK_OUT = 160000
# FlowSep operates on 16 kHz mono audio.
FLOWSEP_SR = 16000
153
+
154
+
155
def separate_audiosep(audio_path, text):
    """Separate the sound described by `text` from `audio_path` using AudioSep.

    Returns:
        (sample_rate, waveform): tuple consumable by gr.Audio.
    """
    model = load_audiosep()
    mixture, _ = librosa.load(audio_path, sr=AUDIOSEP_SR, mono=True)
    input_len = mixture.shape[0]

    with torch.no_grad():
        conditions = model.query_encoder.get_query_embed(
            modality='text', text=[text], device=device
        )
        input_dict = {
            "mixture": torch.Tensor(mixture)[None, None, :].to(device),
            "condition": conditions,
        }
        # Clips longer than 10 s go through chunked inference.
        # NOTE(review): the chunked path is returned untrimmed — confirm
        # chunk_inference already yields input-length audio.
        if input_len > AUDIOSEP_SR * 10:
            separated = model.ss_model.chunk_inference(input_dict).squeeze()
        else:
            waveform = model.ss_model(input_dict)["waveform"]
            separated = waveform.squeeze(0).squeeze(0).data.cpu().numpy()
            separated = separated[:input_len]

    return (AUDIOSEP_SR, separated)
177
+
178
+
179
def _flowsep_process_chunk(model, preprocessor, chunk_wav, text):
    """Run FlowSep on one fixed-size chunk of audio.

    Normalizes and pads/truncates `chunk_wav` to FLOWSEP_CHUNK_IN samples,
    builds the batch dict expected by `model.generate_sample`, and returns
    the first FLOWSEP_CHUNK_OUT separated samples as a 1-D numpy array.
    """
    chunk_wav = preprocessor.preprocess_chunk(chunk_wav)
    if len(chunk_wav) < FLOWSEP_CHUNK_IN:
        pad = np.zeros(FLOWSEP_CHUNK_IN - len(chunk_wav), dtype=np.float32)
        chunk_wav = np.concatenate([chunk_wav, pad])
    chunk_wav = chunk_wav[:FLOWSEP_CHUNK_IN]
    # FIX: the raw STFT from the tuple was never used; unpack it to `_`.
    mixed_mel, _ = preprocessor.wav_feature_extraction(chunk_wav.reshape(1, -1))
    batch = {
        "fname": ["temp"],
        "text": [text],
        "caption": [text],
        # The torch.rand entries below look like schema placeholders the
        # model presumably ignores for text-queried separation — TODO confirm
        # against generate_sample's implementation.
        "waveform": torch.rand(1, 1, FLOWSEP_CHUNK_IN).to(device),
        "log_mel_spec": torch.rand(1, 1024, 64).to(device),
        "sampling_rate": torch.tensor([FLOWSEP_SR]).to(device),
        "label_vector": torch.rand(1, 527).to(device),
        "stft": torch.rand(1, 1024, 512).to(device),
        "mixed_waveform": torch.from_numpy(chunk_wav.reshape(1, 1, FLOWSEP_CHUNK_IN)).to(device),
        "mixed_mel": mixed_mel.reshape(1, mixed_mel.shape[0], mixed_mel.shape[1]).to(device),
    }
    result = model.generate_sample(
        [batch],
        name="temp_result",
        unconditional_guidance_scale=1.0,
        ddim_steps=20,
        n_gen=1,
        save=False,
        save_mixed=False,
    )
    # generate_sample may hand back a numpy array or a torch tensor.
    if isinstance(result, np.ndarray):
        out = result.squeeze()
    else:
        out = result.squeeze().cpu().numpy()
    return out[:FLOWSEP_CHUNK_OUT]
212
+
213
+
214
def separate_flowsep(audio_path, text):
    """Separate the sound described by `text` from `audio_path` using FlowSep.

    Inputs longer than one model chunk are processed piecewise (each chunk
    reads FLOWSEP_CHUNK_IN samples, keeps FLOWSEP_CHUNK_OUT) and re-joined;
    the result is trimmed/zero-padded back to the input length.

    Returns:
        (sample_rate, waveform): tuple consumable by gr.Audio.
    """
    model, preprocessor = load_flowsep()
    full_wav = preprocessor.load_full_audio(audio_path)
    input_len = full_wav.shape[0]

    with torch.no_grad():
        if input_len <= FLOWSEP_CHUNK_IN:
            sep_audio = _flowsep_process_chunk(model, preprocessor, full_wav.copy(), text)
        else:
            pieces = []
            for start in range(0, input_len, FLOWSEP_CHUNK_OUT):
                # Slicing clamps automatically at the end of the array.
                segment = full_wav[start:start + FLOWSEP_CHUNK_IN]
                processed = _flowsep_process_chunk(model, preprocessor, segment.copy(), text)
                keep = min(FLOWSEP_CHUNK_OUT, input_len - start)
                pieces.append(processed[:keep])
            sep_audio = np.concatenate(pieces)

    # Match the output length exactly to the input length.
    if len(sep_audio) > input_len:
        sep_audio = sep_audio[:input_len]
    elif len(sep_audio) < input_len:
        sep_audio = np.pad(sep_audio, (0, input_len - len(sep_audio)), mode="constant", constant_values=0)

    return (FLOWSEP_SR, sep_audio)
240
+
241
+
242
def inference(audio, text, model_choice):
    """Gradio callback: validate the inputs, then dispatch to the model
    selected in the dropdown."""
    if audio is None:
        raise gr.Error("Please upload an audio file / 请上传音频文件")
    if not text or not text.strip():
        raise gr.Error("Please enter a text query / 请输入文本描述")

    if model_choice == "AudioSep-hive":
        return separate_audiosep(audio, text)
    return separate_flowsep(audio, text)
252
+
253
+
254
# Markdown blurb rendered at the top of the app page.
DESCRIPTION = """
# Universal Sound Separation on HIVE

**Hive** is a high-quality synthetic dataset (2k hours) built via an automated pipeline that mines high-purity single-event segments and synthesizes semantically consistent mixtures. Despite using only ~0.2% of the data scale of million-hour baselines, models trained on Hive achieve competitive separation accuracy and strong zero-shot generalization.

This space provides two separation models trained on Hive:
- **AudioSep**: A foundation model for open-domain sound separation with natural language queries, based on [AudioSep](https://github.com/Audio-AGI/AudioSep).
- **FlowSep**: A flow-matching based separation model with text conditioning, based on [FlowSep](https://github.com/Audio-AGI/FlowSep).

**How to use:**
1. Upload an audio file (mix of sounds)
2. Describe what you want to separate (e.g., "piano", "speech", "dog barking")
3. Select a model and click Separate

[[Paper]](https://arxiv.org/abs/2601.22599) | [[Code]](https://github.com/ShandaAI/Hive) | [[Hive Dataset]](https://huggingface.co/datasets/ShandaAI/Hive) | [[Demo Page]](https://shandaai.github.io/Hive/)
"""

# (audio file, text query) pairs shown as clickable examples.
# NOTE(review): noisy_speech.wav is not among the LFS-tracked example files
# in .gitattributes — confirm it actually ships with the repo.
EXAMPLES = [
    ["examples/acoustic_guitar.wav", "acoustic guitar"],
    ["examples/laughing.wav", "laughing"],
    ["examples/ticktok_piano.wav", "A ticktock sound playing at the same rhythm with piano"],
    ["examples/water_drops.wav", "water drops"],
    ["examples/noisy_speech.wav", "speech"],
]

with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Universal Sound Separation on HIVE",
) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Left column: inputs and the action button.
        with gr.Column():
            audio_input = gr.Audio(label="Input Mixture Audio", type="filepath")
            text_input = gr.Textbox(
                label="Text Query",
                placeholder='e.g. "dog barking", "piano playing"',
            )
            model_choice = gr.Dropdown(
                choices=["AudioSep-hive", "FlowSep-hive"],
                value="AudioSep-hive",
                label="Select Model",
            )
            submit_btn = gr.Button("Separate", variant="primary")

        # Right column: separated result.
        with gr.Column():
            audio_output = gr.Audio(label="Separated Audio")

    submit_btn.click(
        fn=inference,
        inputs=[audio_input, text_input, model_choice],
        outputs=audio_output,
    )

    gr.Markdown("## Examples")
    gr.Examples(examples=EXAMPLES, inputs=[audio_input, text_input])
310
+
311
# Flip to True to smoke-test both models locally before launching the UI.
DEBUG = False

def run_debug():
    """Run a one-shot inference smoke test of both models on the bundled
    example clip, printing progress banners."""
    examples_dir = os.path.join(os.path.dirname(__file__), "examples")
    test_path = os.path.join(examples_dir, "acoustic_guitar.wav")
    test_text = "acoustic guitar"
    banner = "=" * 50
    rule = "-" * 40
    print("\n" + banner)
    print("[DEBUG] Starting inference test for both models")
    print(banner)

    if not os.path.exists(test_path):
        print(f"[DEBUG] Skip: {test_path} not found")
        return

    print(f"\n[DEBUG] Using test audio: {test_path}")

    # Exercise both backends in order, with identical reporting.
    for label, separate_fn in (("AudioSep", separate_audiosep), ("FlowSep", separate_flowsep)):
        print("\n" + rule)
        print(f"[DEBUG] {label} inference")
        print(rule)
        print(f"[DEBUG] Loading {label} model...")
        sr, wav = separate_fn(test_path, test_text)
        print(f"[DEBUG] {label} done. Output sr={sr}, shape={np.array(wav).shape}")

    print("\n" + banner)
    print("[DEBUG] Both models passed inference test")
    print(banner + "\n")


if DEBUG:
    run_debug()

demo.launch()
audiosep/__pycache__/utils.cpython-310.pyc ADDED
Binary file (9.65 kB). View file
 
audiosep/__pycache__/utils.cpython-312.pyc ADDED
Binary file (15.4 kB). View file
 
audiosep/config/audiosep_base.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ task_name: AudioSep
3
+
4
+ data:
5
+ datafiles:
6
+ - 'datafiles/template.json'
7
+
8
+ sampling_rate: 32000
9
+ segment_seconds: 5
10
+ loudness_norm:
11
+ lower_db: -10
12
+ higher_db: 10
13
+ max_mix_num: 2
14
+
15
+ model:
16
+ query_net: CLAP
17
+ condition_size: 512
18
+ model_type: ResUNet30
19
+ input_channels: 1
20
+ output_channels: 1
21
+ resume_checkpoint: ""
22
+ use_text_ratio: 1.0
23
+
24
+ train:
25
+ optimizer:
26
+ optimizer_type: AdamW
27
+ learning_rate: 1e-3
28
+ warm_up_steps: 10000
29
+ reduce_lr_steps: 1000000
30
+ lr_lambda_type: constant_warm_up
31
+ num_nodes: 1
32
+ num_workers: 6
33
+ loss_type: l1_wav
34
+ sync_batchnorm: True
35
+ batch_size_per_device: 12
36
+ steps_per_epoch: 10000 # Every 10000 steps is called an `epoch`.
37
+ evaluate_step_frequency: 10000 # Evaluate every #evaluate_step_frequency steps.
38
+ save_step_frequency: 20000 # Save every #save_step_frequency steps.
39
+ early_stop_steps: 10000001
40
+ random_seed: 1234
41
+
audiosep/models/CLAP/__init__.py ADDED
File without changes
audiosep/models/CLAP/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (199 Bytes). View file
 
audiosep/models/CLAP/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (232 Bytes). View file
 
audiosep/models/CLAP/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (209 Bytes). View file
 
audiosep/models/CLAP/open_clip/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .factory import (
2
+ list_models,
3
+ create_model,
4
+ create_model_and_transforms,
5
+ add_model_config,
6
+ )
7
+ from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
8
+ from .model import (
9
+ CLAP,
10
+ CLAPTextCfg,
11
+ CLAPVisionCfg,
12
+ CLAPAudioCfp,
13
+ convert_weights_to_fp16,
14
+ trace_model,
15
+ )
16
+ from .openai import load_openai_model, list_openai_models
17
+ from .pretrained import (
18
+ list_pretrained,
19
+ list_pretrained_tag_models,
20
+ list_pretrained_model_tags,
21
+ get_pretrained_url,
22
+ download_pretrained,
23
+ )
24
+ from .tokenizer import SimpleTokenizer, tokenize
25
+ from .transform import image_transform
audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.35 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.06 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-310.pyc ADDED
Binary file (6.71 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-311.pyc ADDED
Binary file (13.5 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/factory.cpython-312.pyc ADDED
Binary file (11.3 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-310.pyc ADDED
Binary file (4.16 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-311.pyc ADDED
Binary file (9.94 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/feature_fusion.cpython-312.pyc ADDED
Binary file (9.12 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-310.pyc ADDED
Binary file (30.8 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-311.pyc ADDED
Binary file (57.8 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/htsat.cpython-312.pyc ADDED
Binary file (54.1 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-310.pyc ADDED
Binary file (8.01 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-311.pyc ADDED
Binary file (17.8 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/loss.cpython-312.pyc ADDED
Binary file (16.1 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/model.cpython-310.pyc ADDED
Binary file (24.2 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/model.cpython-311.pyc ADDED
Binary file (48.2 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/model.cpython-312.pyc ADDED
Binary file (45.4 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-310.pyc ADDED
Binary file (4.56 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-311.pyc ADDED
Binary file (8.46 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/openai.cpython-312.pyc ADDED
Binary file (7.38 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-310.pyc ADDED
Binary file (13.1 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-311.pyc ADDED
Binary file (30 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/pann_model.cpython-312.pyc ADDED
Binary file (27.2 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-310.pyc ADDED
Binary file (5.08 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-311.pyc ADDED
Binary file (8.33 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/pretrained.cpython-312.pyc ADDED
Binary file (7.14 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-310.pyc ADDED
Binary file (3.48 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-311.pyc ADDED
Binary file (5.82 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/timm_model.cpython-312.pyc ADDED
Binary file (5.05 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (7.4 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-311.pyc ADDED
Binary file (13.9 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (11.1 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-310.pyc ADDED
Binary file (1.02 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-311.pyc ADDED
Binary file (1.6 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/transform.cpython-312.pyc ADDED
Binary file (1.36 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-310.pyc ADDED
Binary file (10.5 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-311.pyc ADDED
Binary file (19.9 kB). View file
 
audiosep/models/CLAP/open_clip/__pycache__/utils.cpython-312.pyc ADDED
Binary file (16.8 kB). View file
 
audiosep/models/CLAP/open_clip/bert.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Standalone text-embedding helpers using BERT, RoBERTa, and BART."""
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers import BartTokenizer, BartModel

# FIX: the original rebound the module-level names `tokenizer` and `model`
# three times, so bert_embeddings() and Roberta_embeddings() — reading the
# globals at call time — silently ran with the *BART* tokenizer and model.
# Each helper now owns a distinct tokenizer/model pair.
_bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
_bert_model = BertModel.from_pretrained("bert-base-uncased")


def bert_embeddings(text):
    """Return BERT model outputs for `text`."""
    encoded_input = _bert_tokenizer(text, return_tensors="pt")
    output = _bert_model(**encoded_input)
    return output


_roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
_roberta_model = RobertaModel.from_pretrained("roberta-base")


def Roberta_embeddings(text):
    """Return RoBERTa model outputs for `text`."""
    encoded_input = _roberta_tokenizer(text, return_tensors="pt")
    output = _roberta_model(**encoded_input)
    return output


_bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
_bart_model = BartModel.from_pretrained("facebook/bart-base")


def bart_embeddings(text):
    """Return BART model outputs for `text`."""
    encoded_input = _bart_tokenizer(text, return_tensors="pt")
    output = _bart_model(**encoded_input)
    return output


# Backward-compatible module-level aliases: after import, the original left
# `tokenizer`/`model` pointing at the BART pair (last assignment won), and
# `text` set to the sample sentence.
tokenizer = _bart_tokenizer
model = _bart_model
text = "Replace me by any text you'd like."