{
  "mulan": {
    "sr": 24000,
    "clip_secs": 10,
    "dim_latent": 512,
    "decoupled_contrastive_learning": true,
    "hierarchical_contrastive_loss": false,
    "hierarchical_contrastive_loss_layers": null,
    "sigmoid_contrastive_loss": false,
    "rank_contrast": true
  },
  "audio_model": {
    "name": "OpenMuQ/MuQ-large-msd-iter",
    "model_dim": 1024,
    "use_layer_idx": -1
  },
  "text_model": {
    "name": "xlm-roberta-base",
    "model_dim": null,
    "use_layer_idx": -1
  },
  "audio_transformer": {
    "dim": 768,
    "tf_depth": 0,
    "heads": 8,
    "dim_head": 64,
    "attn_dropout": 0,
    "ff_dropout": 0,
    "ff_mult": 4
  },
  "text_transformer": {
    "dim": 768,
    "tf_depth": 8,
    "max_seq_len": 1024,
    "dim_head": 64,
    "heads": 8,
    "attn_dropout": 0,
    "ff_dropout": 0,
    "ff_mult": 4
  }
}