# Experiment configuration for the "2channel_flow" run of the FlowSep project.
#
# NOTE(review): this file had been collapsed onto two physical lines, which
# made it unparseable as YAML. The hierarchy below was reconstructed from the
# key order and the target/params convention used throughout the file; every
# key, value and original comment is preserved. Please confirm the nesting
# against the code that consumes this config.

# ---------------------------------------------------------------------------
# Experiment identification and paths
# ---------------------------------------------------------------------------
metadata_root: "models/FlowSep/metadata-master/processed/dataset_root.json"
log_directory: "models/FlowSep/model_logs_curationed"
exp_group: "lass"
exp_name: "2channel_flow"
project: "FlowSep"

# ---------------------------------------------------------------------------
# Dataset selection
# ---------------------------------------------------------------------------
data:
  train: ["audiocaps"]
  val: "audiocaps"
  test: "audiocaps"
  mix_train: "train"
  class_label_indices: "audiocaps"
  dataloader_add_ons: []
  # NOTE(review): mix_audio / random_empty are placed under `data` because
  # they directly follow the other dataset keys -- confirm with the loader.
  mix_audio: true
  random_empty: 0.0001

# ---------------------------------------------------------------------------
# Training schedule and checkpointing
# ---------------------------------------------------------------------------
step:
  validation_every_n_epochs: 1
  save_checkpoint_every_n_steps: 100000
  max_steps: 4000000
  save_top_k: 4

# ---------------------------------------------------------------------------
# Audio front-end: waveform, STFT and mel settings
# ---------------------------------------------------------------------------
preprocessing:
  audio:
    sampling_rate: 16000
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: 64
    mel_fmin: 0
    mel_fmax: 8000

augmentation:
  mixup: 0.0

# ---------------------------------------------------------------------------
# Model definition
# ---------------------------------------------------------------------------
model:
  target: latent_diffusion.models.ddpm_flow.LatentDiffusion
  params:
    base_learning_rate: 5.0e-05
    # Same value as preprocessing.audio.sampling_rate above.
    sampling_rate: 16000
    batchsize: 8
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps  # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: 256  # TODO might need to change
    latent_f_size: 16
    channels: 8  # TODO might need to change
    extra_channels: true
    extra_channel_key: mixed_mel
    monitor: val/loss_simple_ema
    scale_by_std: true
    clap_trainable: false
    retrival_num: 0
    use_clap: false
    euler: true

    # Denoising network.
    unet_config:
      target: latent_diffusion.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64  # Ignore this parameter
        context_dim:
          - 1024
        # NOTE(review): 16 is twice `channels: 8` above; presumably the
        # latent plus the extra (mixed_mel) channels -- confirm.
        in_channels: 16  # The input channel of the UNet model
        out_channels: 16  # TODO might need to change
        model_channels: 128  # TODO might need to change
        attention_resolutions:
          - 8
          - 4
          - 2
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 3
          - 5
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1

    # First-stage autoencoder.
    first_stage_config:
      base_learning_rate: 4.5e-05
      target: latent_encoder.autoencoder.AutoencoderKL
      params:
        # reload_from_ckpt: "model_logs/pretrained/vae.ckpt"
        reload_from_ckpt: "vae.ckpt"
        batchsize: 2
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        # Same value as ddconfig.z_channels below.
        embed_dim: 8
        time_shuffle: 1
        lossconfig:
          target: latent_diffusion.modules.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          z_channels: 8
          resolution: 256
          # Same value as preprocessing.mel.n_mel_channels above.
          mel_bins: 64
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0

    # Conditioning encoders, keyed by an arbitrary name.
    cond_stage_config:
      crossattn_text:
        cond_stage_key: caption
        conditioning_key: crossattn
        target: latent_diffusion.modules.encoders.modules.FlanT5HiddenState
        params:
          emb_num: 1
          input_caption: true

    # Settings used when sampling for evaluation.
    evaluation_params:
      unconditional_guidance_scale: 1.0
      # ddim_sampling_steps: 10
      # NOTE(review): in the collapsed file this key sat on the same line as
      # the comment above; it is restored as an active key -- confirm.
      n_candidates_per_samples: 1