Audio-to-Audio
English
audio
sound-separation
flowsep
JusperLee committed on
Commit
017a0d2
·
verified ·
1 Parent(s): f02e798

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +135 -0
config.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
metadata_root: "metadata-master/processed/dataset_root.json"
log_directory: "model_logs"
exp_group: "lass"
exp_name: "2channel_flow"
project: "FlowSep"

data:
  train: ["audiocaps"]
  val: "audiocaps"
  test: "audiocaps"
  mix_train: "train"
  class_label_indices: "audiocaps"
  dataloader_add_ons: []
  mix_audio: true
  random_empty: 0.0001

step:
  validation_every_n_epochs: 1
  save_checkpoint_every_n_steps: 100000
  max_steps: 4000000
  save_top_k: 4

preprocessing:
  audio:
    sampling_rate: 16000
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: 64
    mel_fmin: 0
    mel_fmax: 8000

augmentation:
  mixup: 0.0

model:
  target: latent_diffusion.models.ddpm_flow.LatentDiffusion
  params:
    base_learning_rate: 5.0e-05
    sampling_rate: 16000
    batchsize: 8
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: 256 # TODO might need to change
    latent_f_size: 16
    channels: 8 # TODO might need to change
    extra_channels: true
    extra_channel_key: mixed_mel
    monitor: val/loss_simple_ema
    scale_by_std: true
    clap_trainable: false
    retrival_num: 0
    use_clap: false
    euler: true
    unet_config:
      target: latent_diffusion.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64 # Ignore this parameter
        context_dim:
          - 1024
        in_channels: 16 # The input channel of the UNet model
        out_channels: 16 # TODO might need to change
        model_channels: 128 # TODO might need to change
        attention_resolutions:
          - 8
          - 4
          - 2
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 3
          - 5
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
    first_stage_config:
      base_learning_rate: 4.5e-05
      target: latent_encoder.autoencoder.AutoencoderKL
      params:
        # reload_from_ckpt: "model_logs/pretrained/vae.ckpt"
        reload_from_ckpt: "models/FlowSep/vae.ckpt"
        batchsize: 2
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: 8
        time_shuffle: 1
        lossconfig:
          target: latent_diffusion.modules.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          z_channels: 8
          resolution: 256
          mel_bins: 64
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
    cond_stage_config:
      crossattn_text:
        cond_stage_key: caption
        conditioning_key: crossattn
        target: latent_diffusion.modules.encoders.modules.FlanT5HiddenState
        params:
          emb_num: 1
          input_caption: true

evaluation_params:
  unconditional_guidance_scale: 1.0 #
  ddim_sampling_steps: 10
  n_candidates_per_samples: 1