stable-audio-3-medium / model_config.json

Update model_config.json

a30034d verified 7 days ago

10.3 kB

	{
	"model_type": "diffusion_cond_inpaint",
	"sample_size": 16777216,
	"sample_rate": 44100,
	"audio_channels": 2,
	"model": {
	"pretransform": {
	"type": "autoencoder",
	"iterate_batch": false,
	"chunked": true,
	"config": {
	"pretransform": {
	"type": "patched",
	"config": {
	"patch_size": 256,
	"channels": 2
	}
	},
	"encoder": {
	"type": "taae_v2",
	"requires_grad": false,
	"config": {
	"in_channels": 512,
	"channels": 256,
	"c_mults": [
	6
	],
	"strides": [
	16
	],
	"latent_dim": 256,
	"transformer_depths": [
	12
	],
	"use_snake": false,
	"use_dilated_conv": false,
	"checkpointing": true,
	"conformer": false,
	"layer_scale": false,
	"differential": true,
	"conv_bias": false,
	"mapping_style": "none",
	"dim_heads": 64,
	"enable_inner_layer_dropout": false,
	"sliding_window": [
	1,
	1
	],
	"variable_stride": true,
	"use_flash": true,
	"mask_noise": 0.001
	}
	},
	"decoder": {
	"type": "taae_v2",
	"requires_grad": false,
	"config": {
	"out_channels": 512,
	"channels": 256,
	"c_mults": [
	6
	],
	"strides": [
	16
	],
	"latent_dim": 256,
	"transformer_depths": [
	12
	],
	"sinusoidal_blocks": [
	8
	],
	"use_snake": false,
	"use_dilated_conv": false,
	"checkpointing": false,
	"conformer": false,
	"layer_scale": false,
	"differential": true,
	"conv_bias": false,
	"mapping_style": "none",
	"dim_heads": 64,
	"enable_inner_layer_dropout": false,
	"sliding_window": [
	1,
	1
	],
	"variable_stride": true,
	"use_flash": true,
	"mask_noise": 0.1
	}
	},
	"bottleneck": {
	"type": "softnorm",
	"config": {
	"dim": 256,
	"noise_augment_dim": 0,
	"noise_regularize": true,
	"auto_scale": true
	}
	},
	"latent_dim": 256,
	"downsampling_ratio": 4096,
	"io_channels": 2
	}
	},
	"conditioning": {
	"configs": [
	{
	"id": "prompt",
	"type": "t5gemma",
	"config": {
	"max_length": 256,
	"padding_mode": "learned",
	"repo_id": "cocktailpeanut/stable-audio-3-medium",
	"subfolder": "t5gemma-b-b-ul2"
	}
	},
	{
	"id": "seconds_total",
	"type": "number",
	"config": {
	"min_val": 0,
	"max_val": 384,
	"fourier_features_type": "expo"
	}
	}
	],
	"cond_dim": 768
	},
	"diffusion": {
	"cross_attention_cond_ids": [
	"prompt",
	"seconds_total"
	],
	"global_cond_ids": [
	"seconds_total"
	],
	"local_add_cond_ids": [
	"inpaint_mask",
	"inpaint_masked_input"
	],
	"type": "dit",
	"diffusion_objective": "rf_denoiser",
	"mask_padding_attention": true,
	"use_effective_length_for_schedule": true,
	"distribution_shift_options": {
	"min_length": 256,
	"max_length": 4096
	},
	"config": {
	"io_channels": 256,
	"embed_dim": 1536,
	"depth": 24,
	"num_heads": 24,
	"cond_token_dim": 768,
	"global_cond_dim": 768,
	"local_add_cond_dim": 257,
	"global_cond_type": "adaLN",
	"timestep_features_type": "expo",
	"attn_kwargs": {
	"qk_norm": "rms",
	"differential": true
	},
	"norm_type": "rms_norm",
	"norm_kwargs": {
	"force_fp32": true
	},
	"ff_kwargs": {
	"mult": 4.0
	},
	"num_memory_tokens": 64
	}
	},
	"io_channels": 256
	},
	"training": {
	"use_ema": true,
	"log_loss_info": false,
	"pre_encoded": true,
	"ot_coupling": true,
	"silence_extension_scale_seconds": 4.0,
	"timestep_sampler": "trunc_logit_normal",
	"mask_loss_weight": 1.0,
	"cfg_dropout_prob": 0.1,
	"inpainting": {
	"mask_kwargs": {
	"mask_type_probabilities": [
	0.1,
	0.8,
	0.1
	]
	}
	},
	"arc": {
	"noise_dist": {
	"generator": "trunc_logit_normal",
	"discriminator": "logit_normal"
	},
	"disc_update_interval": 2,
	"use_model_as_discriminator": true,
	"discriminator_base_ckpt": "/path/to/discriminator/ckpt.pt",
	"discriminator": {
	"type": "dilated_conv",
	"dit_hidden_layer": [
	18
	],
	"weights": {
	"generator": 1.0,
	"discriminator": 1.0
	},
	"reset_every": 250,
	"loss_type": "relativistic",
	"config": {
	"hidden_dim": 1024,
	"dilations": [
	1,
	1,
	1,
	1,
	1
	]
	},
	"disc_hinge_loss": false,
	"contrastive": true,
	"include_grad_penalties": false
	}
	},
	"optimizer_configs": {
	"diffusion": {
	"optimizer": {
	"type": "MuonAdamW",
	"config": {
	"muon_lr": 1e-05,
	"muon_momentum": 0.95,
	"adam_lr": 1e-06,
	"adam_betas": [
	0.9,
	0.95
	],
	"adam_weight_decay": 0.01,
	"fused_layer_patterns": [
	".to_qkv.",
	".to_kv.",
	".to_q.",
	".ff..proj.*"
	]
	}
	},
	"scheduler": {
	"type": "InverseLR",
	"config": {
	"inv_gamma": 1000000,
	"power": 0.5,
	"warmup": 0.95
	}
	}
	},
	"discriminator": {
	"optimizer": {
	"type": "MuonAdamW",
	"config": {
	"muon_lr": 1e-05,
	"muon_momentum": 0.95,
	"adam_lr": 1e-06,
	"adam_betas": [
	0.9,
	0.95
	],
	"adam_weight_decay": 0.01,
	"fused_layer_patterns": [
	".to_qkv.",
	".to_kv.",
	".to_q.",
	".ff..proj.*"
	]
	}
	},
	"scheduler": {
	"type": "InverseLR",
	"config": {
	"inv_gamma": 1000000,
	"power": 0.5,
	"warmup": 0.9
	}
	}
	}
	},
	"demo": {
	"demo_every": 500,
	"demo_steps": 8,
	"num_demos": 2,
	"demo_cond": [
	{
	"prompt": "Meditative lo-fi ambient piano jazz, soft acoustic drum kit",
	"seconds_total": 190
	},
	{
	"prompt": "A tropical house track with upbeat melodies, a driving bassline, and cheery vibes",
	"seconds_total": 180
	}
	],
	"demo_cfg_scales": [
	1
	]
	}
	}
	}