# Network configuration for genmo.network.genmo_diffusion.GENMODiffusion and its
# denoiser, written with `${...}` interpolations (OmegaConf-style syntax;
# `_target_` is presumably consumed by Hydra's instantiate — confirm against the loader).
# NOTE(review): in this view every line is wrapped in `| ... | |` and carries no
# indentation, so key nesting is not directly visible. The notes below rely only
# on what the interpolation paths themselves imply.
| _target_: genmo.network.genmo_diffusion.GENMODiffusion | |
# No leading dot: this path is looked up from the config root (`pipeline.args`).
| args: ${pipeline.args} | |
# Paths starting with a dot are relative to the level this key lives at, so
# `model_cfg` (declared below) is expected to be a sibling of these keys.
# The referenced denoiser values appear further down (latent_dim: 1024,
# music_mask_prob: 0.1, speech_mask_prob: 0.1) — assuming those lines sit
# directly under `denoiser`; TODO confirm against the original indentation.
| latent_dim: ${.model_cfg.denoiser.latent_dim} | |
| cond_merge_strategy: "add" | |
| music_mask_prob: ${.model_cfg.denoiser.music_mask_prob} | |
| speech_mask_prob: ${.model_cfg.denoiser.speech_mask_prob} | |
| encoded_music_dim: ${pipeline.args.encoded_music_dim} | |
| model_cfg: | |
# No leading dot here: this is looked up from the config root, not from the
# `model_cfg` key declared on the line above.
# NOTE(review): if this fragment were itself the root config, the reference would
# point back at this same key — confirm how this file is composed.
| diffusion: ${model_cfg.diffusion} | |
| denoiser: | |
| _target_: genmo.network.genmo_denoiser.NetworkEncoderRoPE | |
# xt_dim follows output_dim and njoints follows xt_dim through sibling-relative
# interpolation, so all three resolve to 151; change output_dim to change them together.
| output_dim: 151 | |
| xt_dim: ${.output_dim} | |
| njoints: ${.xt_dim} | |
# The music/speech values below are the ones the relative interpolations near
# the top of this block point at; text_mask_prob is not referenced elsewhere in
# this view.
| text_mask_prob: 0.1 | |
| music_mask_prob: 0.1 | |
| speech_mask_prob: 0.1 | |
| use_text_pos_enc: true | |
# NOTE(review): with indentation lost, it cannot be determined here which of the
# following keys belong to `text_encoder_cfg` and which to `denoiser`.
# `latent_dim` at least is expected at denoiser level, since it is referenced
# above as `${.model_cfg.denoiser.latent_dim}`.
| text_encoder_cfg: | |
| mode: all | |
| cross_attn_type: mha | |
| latent_dim: 1024 | |
| num_layers: 16 | |
| num_heads: 8 | |
| mlp_ratio: 4 | |