{ "model_type": "gem", "body_model": "soma", "denoiser": { "latent_dim": 1024, "num_layers": 16, "num_heads": 8, "mlp_ratio": 4, "scale_params_dim": 69, "scale_comps_dim": 28, "fix_scale_comps": true, "encode_text": false, "encoded_text_dim": 4096, "text_encoder_cfg": { "mode": "none", "cross_attn_type": "mha" }, "text_mask_prob": 0.1, "music_mask_prob": 0.1, "speech_mask_prob": 0.1, "use_text_pos_enc": true }, "endecoder": { "stats_name": "MM_V2_SOMA_METROSIM", "encode_type": "soma_v2", "feat_dim": 585, "clip_std": true }, "diffusion": { "sampler": "ddim", "noise_schedule": "cosine", "test_timestep_respacing": "50", "ddim_eta": 0.0, "guidance_param": 2.5 }, "pipeline": { "remove_fingers": true, "extract_features": false } }