---
# FlowSep training configuration (latent-diffusion source separation, 2-channel flow)
metadata_root: "models/FlowSep/metadata-master/processed/dataset_root.json"
log_directory: "models/FlowSep/model_logs_curationed"
exp_group: "lass"
exp_name: "2channel_flow"
project: "FlowSep"

# Dataset selection and mixing behaviour for train/val/test splits.
data:
  train: ["audiocaps"]
  val: "audiocaps"
  test: "audiocaps"
  mix_train: "train"
  class_label_indices: "audiocaps"
  dataloader_add_ons: []
  mix_audio: true
  random_empty: 0.0001

# Trainer schedule: validation cadence, checkpointing, and step budget.
step:
  validation_every_n_epochs: 1
  save_checkpoint_every_n_steps: 100000
  max_steps: 4000000
  save_top_k: 4

# Waveform -> mel-spectrogram front-end parameters.
preprocessing:
  audio:
    sampling_rate: 16000
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: 64
    mel_fmin: 0
    mel_fmax: 8000

augmentation:
  mixup: 0.0

model:
  target: latent_diffusion.models.ddpm_flow.LatentDiffusion
  params:
    base_learning_rate: 5.0e-05
    sampling_rate: 16000
    batchsize: 8
    # Linear beta schedule endpoints for the diffusion process.
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    # Probability of dropping the condition for classifier-free guidance.
    unconditional_prob_cfg: 0.1
    parameterization: eps  # [eps, x0, v]
    first_stage_key: fbank
    latent_t_size: 256  # TODO might need to change
    latent_f_size: 16
    channels: 8  # TODO might need to change
    # Concatenate the mixture mel as extra input channels to the UNet.
    extra_channels: true
    extra_channel_key: mixed_mel
    monitor: val/loss_simple_ema
    scale_by_std: true
    clap_trainable: false
    retrival_num: 0
    use_clap: false
    euler: true

    # Denoising backbone.
    unet_config:
      target: latent_diffusion.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64  # Ignore this parameter
        context_dim:
          - 1024
        in_channels: 16  # The input channel of the UNet model
        out_channels: 16  # TODO might need to change
        model_channels: 128  # TODO might need to change
        attention_resolutions:
          - 8
          - 4
          - 2
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 3
          - 5
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1

    # VAE that maps mel spectrograms to/from the latent space.
    first_stage_config:
      base_learning_rate: 4.5e-05
      target: latent_encoder.autoencoder.AutoencoderKL
      params:
        # reload_from_ckpt: "model_logs/pretrained/vae.ckpt"
        reload_from_ckpt: "vae.ckpt"
        batchsize: 2
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: 8
        time_shuffle: 1
        lossconfig:
          target: latent_diffusion.modules.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          z_channels: 8
          resolution: 256
          mel_bins: 64
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0

    # Text conditioning via FLAN-T5 hidden states (cross-attention).
    cond_stage_config:
      crossattn_text:
        cond_stage_key: caption
        conditioning_key: crossattn
        target: latent_diffusion.modules.encoders.modules.FlanT5HiddenState
        params:
          emb_num: 1
          input_caption: true

    # Sampling settings used during evaluation.
    evaluation_params:
      unconditional_guidance_scale: 1.0
      ddim_sampling_steps: 10
      n_candidates_per_samples: 1