Audio-to-Audio
English
audio
sound-separation
flowsep
File size: 3,404 Bytes
6e76b77
 
017a0d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7af3360
017a0d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Experiment identity and filesystem locations.
# Both paths are relative; this file does not set a base directory for them.
metadata_root: 'models/FlowSep/metadata-master/processed/dataset_root.json'
log_directory: 'models/FlowSep/model_logs_curationed'
# Naming of this run: group, experiment name, and project.
exp_group: 'lass'
exp_name: '2channel_flow'
project: 'FlowSep'

# Dataset selection. Every split below names the same dataset, "audiocaps".
data:
  # `train` is a list, whereas `val` and `test` are single names.
  train: ['audiocaps']
  val: 'audiocaps'
  test: 'audiocaps'
  mix_train: 'train'
  class_label_indices: 'audiocaps'
  # Explicit empty list (not null).
  dataloader_add_ons: []
  mix_audio: true
  # NOTE(review): presumably a probability — confirm against the dataset code.
  random_empty: 0.0001

# Training schedule: validation cadence, checkpoint cadence, and step budget.
step:
  validation_every_n_epochs: 1
  # max_steps / save_checkpoint_every_n_steps = 40 checkpoint intervals.
  save_checkpoint_every_n_steps: 100000
  max_steps: 4000000
  # NOTE(review): presumably the number of best checkpoints kept — confirm
  # against the trainer/checkpoint callback that reads this key.
  save_top_k: 4

# Audio front-end settings, grouped as raw audio, STFT, and mel filterbank.
preprocessing:
  audio:
    # Same value as model.params.sampling_rate.
    sampling_rate: 16000
    max_wav_value: 32768.0
    # NOTE(review): presumably seconds — confirm against the data loader.
    duration: 10.24
  stft:
    filter_length: 1024
    # If hop_length is in samples, 16000 / 160 = 100 frames per second —
    # TODO confirm.
    hop_length: 160
    win_length: 1024
  mel:
    # Same value as first_stage_config.params.ddconfig.mel_bins.
    n_mel_channels: 64
    mel_fmin: 0
    mel_fmax: 8000  # equals sampling_rate / 2

# Data augmentation settings.
augmentation:
  # NOTE(review): 0.0 presumably disables mixup — confirm against the dataset code.
  mixup: 0.0

# Top-level model definition: a dotted class path in `target` plus its
# `params`. The nested *_config mappings below follow the same target/params
# layout.
model:
  target: latent_diffusion.models.ddpm_flow.LatentDiffusion
  params:
    base_learning_rate: 5.0e-05
    # Same value as preprocessing.audio.sampling_rate.
    sampling_rate: 16000
    # Differs from first_stage_config.params.batchsize (2).
    batchsize: 8
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps # [eps, x0, v]
    # Same key name as first_stage_config.params.image_key.
    first_stage_key: fbank
    latent_t_size: 256 # TODO might need to change
    latent_f_size: 16
    # Same value as first_stage_config embed_dim and ddconfig.z_channels (8);
    # unet_config in_channels/out_channels are twice this (16).
    channels: 8 # TODO might need to change
    extra_channels: true
    extra_channel_key: mixed_mel
    monitor: val/loss_simple_ema
    scale_by_std: true
    clap_trainable: false
    # NOTE(review): key is spelled "retrival_num" (sic). It is a runtime key;
    # do not rename it here unless the consuming code is renamed as well.
    retrival_num: 0
    use_clap: false
    euler: true
    # UNet settings. in_channels and out_channels are both 16, which is twice
    # the `channels: 8` value of the enclosing params.
    unet_config:
      target: latent_diffusion.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64  # Ignore this parameter
        # Single-element list, not a bare integer.
        context_dim: [1024]
        in_channels: 16  # The input channel of the UNet model
        out_channels: 16  # TODO might need to change
        model_channels: 128  # TODO might need to change
        attention_resolutions: [8, 4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 3, 5]
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
    # First-stage autoencoder settings (target class: AutoencoderKL).
    # Note that base_learning_rate sits beside `target`, not inside `params`.
    first_stage_config:
      base_learning_rate: 4.5e-05
      target: latent_encoder.autoencoder.AutoencoderKL
      params:
        # Alternative checkpoint location, kept commented out:
        # reload_from_ckpt: "model_logs/pretrained/vae.ckpt"
        reload_from_ckpt: 'vae.ckpt'
        batchsize: 2
        monitor: val/rec_loss
        # Same key name as model.params.first_stage_key.
        image_key: fbank
        subband: 1
        # Same value as ddconfig.z_channels and model.params.channels.
        embed_dim: 8
        time_shuffle: 1
        lossconfig:
          target: latent_diffusion.modules.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          z_channels: 8
          resolution: 256
          # Same value as preprocessing.mel.n_mel_channels.
          mel_bins: 64
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult: [1, 2, 4]
          num_res_blocks: 2
          # Explicit empty list (not null).
          attn_resolutions: []
          dropout: 0.0
    # Conditioning branches. There is one entry, `crossattn_text`, whose
    # cond_stage_key is `caption` and whose conditioning_key is `crossattn`.
    cond_stage_config:
      crossattn_text:
        cond_stage_key: caption
        conditioning_key: crossattn
        # This is the only conditioning encoder configured; use_clap is false
        # in the enclosing params.
        target: latent_diffusion.modules.encoders.modules.FlanT5HiddenState
        params:
          emb_num: 1
          input_caption: true


    # Sampling settings grouped under evaluation; nested inside model.params.
    # NOTE(review): presumably read by the model's validation/inference path —
    # confirm where these keys are consumed.
    evaluation_params:
      unconditional_guidance_scale: 1.0 # NOTE(review): presumably 1.0 means no added guidance — confirm
      ddim_sampling_steps: 10
      n_candidates_per_samples: 1