{
  "model_type": "gem",
  "body_model": "soma",
  "denoiser": {
    "latent_dim": 1024,
    "num_layers": 16,
    "num_heads": 8,
    "mlp_ratio": 4,
    "scale_params_dim": 69,
    "scale_comps_dim": 28,
    "fix_scale_comps": true,
    "encode_text": false,
    "encoded_text_dim": 4096,
    "text_encoder_cfg": {
      "mode": "none",
      "cross_attn_type": "mha"
    },
    "text_mask_prob": 0.1,
    "music_mask_prob": 0.1,
    "speech_mask_prob": 0.1,
    "use_text_pos_enc": true
  },
  "endecoder": {
    "stats_name": "MM_V2_SOMA_METROSIM",
    "encode_type": "soma_v2",
    "feat_dim": 585,
    "clip_std": true
  },
  "diffusion": {
    "sampler": "ddim",
    "noise_schedule": "cosine",
    "test_timestep_respacing": "50",
    "ddim_eta": 0.0,
    "guidance_param": 2.5
  },
  "pipeline": {
    "remove_fingers": true,
    "extract_features": false
  }
}