JusperLee committed on
Commit
fad6e4b
·
0 Parent(s):

Duplicate from ShandaAI/FlowSep-hive

Browse files
Files changed (5) hide show
  1. .gitattributes +35 -0
  2. README.md +33 -0
  3. config.yaml +135 -0
  4. flowsep_hive.ckpt +3 -0
  5. vae.ckpt +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ tags:
6
+ - audio
7
+ - sound-separation
8
+ - audio-to-audio
9
+ - flowsep
10
+ datasets:
11
+ - ShandaAI/Hive
12
+ ---
13
+
14
+ # FlowSep-hive
15
+
16
+ ## Model Description
17
+
18
+ **FlowSep-hive** is a data-efficient, query-based universal sound separation model trained on the [Hive dataset](https://huggingface.co/datasets/ShandaAI/Hive). By leveraging the high-quality, semantically consistent Hive dataset, this model achieves competitive separation accuracy and perceptual quality comparable to state-of-the-art models (such as SAM-Audio) while utilizing only a fraction (~0.2%) of the training data volume.
19
+
20
+ This model is developed by **Shanda AI Research Tokyo** and is introduced in the paper: [A Semantically Consistent Dataset for Data-Efficient Query-Based Universal Sound Separation](https://arxiv.org/abs/2601.22599).
21
+
22
+ ## Model Details
23
+
24
+ - **Model Type:** Query-Based Universal Sound Separation
25
+ - **Language(s):** English (for text queries)
26
+ - **License:** Apache 2.0
27
+ - **Trained on:** [ShandaAI/Hive](https://huggingface.co/datasets/ShandaAI/Hive) (2,442 hours of raw audio, 19.6M mixtures)
28
+ - **Paper:** [arXiv:2601.22599](https://arxiv.org/abs/2601.22599)
29
+ - **Code Repository:** [GitHub - ShandaAI/Hive](https://github.com/ShandaAI/Hive)
30
+
31
+ ## Uses
32
+
33
+ The model is intended for universal sound separation tasks, allowing users to extract specific sounds from complex audio mixtures using multimodal prompts (e.g., text descriptions or audio queries).
config.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ metadata_root: "models/FlowSep/metadata-master/processed/dataset_root.json"
2
+ log_directory: "models/FlowSep/model_logs_curationed"
3
+ exp_group: "lass"
4
+ exp_name: "2channel_flow"
5
+ project: "FlowSep"
6
+
7
+ data:
8
+ train: ["audiocaps"]
9
+ val: "audiocaps"
10
+ test: "audiocaps"
11
+ mix_train: "train"
12
+ class_label_indices: "audiocaps"
13
+ dataloader_add_ons: []
14
+ mix_audio: true
15
+ random_empty: 0.0001
16
+
17
+ step:
18
+ validation_every_n_epochs: 1
19
+ save_checkpoint_every_n_steps: 100000
20
+ max_steps: 4000000
21
+ save_top_k: 4
22
+
23
+ preprocessing:
24
+ audio:
25
+ sampling_rate: 16000
26
+ max_wav_value: 32768.0
27
+ duration: 10.24
28
+ stft:
29
+ filter_length: 1024
30
+ hop_length: 160
31
+ win_length: 1024
32
+ mel:
33
+ n_mel_channels: 64
34
+ mel_fmin: 0
35
+ mel_fmax: 8000
36
+
37
+ augmentation:
38
+ mixup: 0.0
39
+
40
+ model:
41
+ target: latent_diffusion.models.ddpm_flow.LatentDiffusion
42
+ params:
43
+ base_learning_rate: 5.0e-05
44
+ sampling_rate: 16000
45
+ batchsize: 8
46
+ linear_start: 0.0015
47
+ linear_end: 0.0195
48
+ num_timesteps_cond: 1
49
+ log_every_t: 200
50
+ timesteps: 1000
51
+ unconditional_prob_cfg: 0.1
52
+ parameterization: eps # [eps, x0, v]
53
+ first_stage_key: fbank
54
+ latent_t_size: 256 # TODO might need to change
55
+ latent_f_size: 16
56
+ channels: 8 # TODO might need to change
57
+ extra_channels: true
58
+ extra_channel_key: mixed_mel
59
+ monitor: val/loss_simple_ema
60
+ scale_by_std: true
61
+ clap_trainable: false
62
+ retrival_num: 0
63
+ use_clap: false
64
+ euler: true
65
+ unet_config:
66
+ target: latent_diffusion.modules.diffusionmodules.openaimodel.UNetModel
67
+ params:
68
+ image_size: 64 # Ignore this parameter
69
+ context_dim:
70
+ - 1024
71
+ in_channels: 16 # The input channel of the UNet model
72
+ out_channels: 16 # TODO might need to change
73
+ model_channels: 128 # TODO might need to change
74
+ attention_resolutions:
75
+ - 8
76
+ - 4
77
+ - 2
78
+ num_res_blocks: 2
79
+ channel_mult:
80
+ - 1
81
+ - 2
82
+ - 3
83
+ - 5
84
+ num_head_channels: 32
85
+ use_spatial_transformer: true
86
+ transformer_depth: 1
87
+ first_stage_config:
88
+ base_learning_rate: 4.5e-05
89
+ target: latent_encoder.autoencoder.AutoencoderKL
90
+ params:
91
+ # reload_from_ckpt: "model_logs/pretrained/vae.ckpt"
92
+ reload_from_ckpt: "vae.ckpt"
93
+ batchsize: 2
94
+ monitor: val/rec_loss
95
+ image_key: fbank
96
+ subband: 1
97
+ embed_dim: 8
98
+ time_shuffle: 1
99
+ lossconfig:
100
+ target: latent_diffusion.modules.losses.LPIPSWithDiscriminator
101
+ params:
102
+ disc_start: 50001
103
+ kl_weight: 1.0
104
+ disc_weight: 0.5
105
+ disc_in_channels: 1
106
+ ddconfig:
107
+ double_z: true
108
+ z_channels: 8
109
+ resolution: 256
110
+ mel_bins: 64
111
+ downsample_time: false
112
+ in_channels: 1
113
+ out_ch: 1
114
+ ch: 128
115
+ ch_mult:
116
+ - 1
117
+ - 2
118
+ - 4
119
+ num_res_blocks: 2
120
+ attn_resolutions: []
121
+ dropout: 0.0
122
+ cond_stage_config:
123
+ crossattn_text:
124
+ cond_stage_key: caption
125
+ conditioning_key: crossattn
126
+ target: latent_diffusion.modules.encoders.modules.FlanT5HiddenState
127
+ params:
128
+ emb_num: 1
129
+ input_caption: true
130
+
131
+
132
+ evaluation_params:
133
+ unconditional_guidance_scale: 1.0 #
134
+ ddim_sampling_steps: 10
135
+ n_candidates_per_samples: 1
flowsep_hive.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7b5379592251dc655757a7dbac109346cbc6c8763dd73efd19326fe6c39fdd
3
+ size 5963371062
vae.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b97ed587aaf2ecaadc51508da5edc353bed4a85f2f3e77a613fd040bfcef0fbe
3
+ size 347515318