Instructions to use cocktailpeanut/stable-audio-3-medium with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Stable Audio 3
How to use cocktailpeanut/stable-audio-3-medium with Stable Audio 3:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "model_type": "diffusion_cond_inpaint", | |
| "sample_size": 16777216, | |
| "sample_rate": 44100, | |
| "audio_channels": 2, | |
| "model": { | |
| "pretransform": { | |
| "type": "autoencoder", | |
| "iterate_batch": false, | |
| "chunked": true, | |
| "config": { | |
| "pretransform": { | |
| "type": "patched", | |
| "config": { | |
| "patch_size": 256, | |
| "channels": 2 | |
| } | |
| }, | |
| "encoder": { | |
| "type": "taae_v2", | |
| "requires_grad": false, | |
| "config": { | |
| "in_channels": 512, | |
| "channels": 256, | |
| "c_mults": [ | |
| 6 | |
| ], | |
| "strides": [ | |
| 16 | |
| ], | |
| "latent_dim": 256, | |
| "transformer_depths": [ | |
| 12 | |
| ], | |
| "use_snake": false, | |
| "use_dilated_conv": false, | |
| "checkpointing": true, | |
| "conformer": false, | |
| "layer_scale": false, | |
| "differential": true, | |
| "conv_bias": false, | |
| "mapping_style": "none", | |
| "dim_heads": 64, | |
| "enable_inner_layer_dropout": false, | |
| "sliding_window": [ | |
| 1, | |
| 1 | |
| ], | |
| "variable_stride": true, | |
| "use_flash": true, | |
| "mask_noise": 0.001 | |
| } | |
| }, | |
| "decoder": { | |
| "type": "taae_v2", | |
| "requires_grad": false, | |
| "config": { | |
| "out_channels": 512, | |
| "channels": 256, | |
| "c_mults": [ | |
| 6 | |
| ], | |
| "strides": [ | |
| 16 | |
| ], | |
| "latent_dim": 256, | |
| "transformer_depths": [ | |
| 12 | |
| ], | |
| "sinusoidal_blocks": [ | |
| 8 | |
| ], | |
| "use_snake": false, | |
| "use_dilated_conv": false, | |
| "checkpointing": false, | |
| "conformer": false, | |
| "layer_scale": false, | |
| "differential": true, | |
| "conv_bias": false, | |
| "mapping_style": "none", | |
| "dim_heads": 64, | |
| "enable_inner_layer_dropout": false, | |
| "sliding_window": [ | |
| 1, | |
| 1 | |
| ], | |
| "variable_stride": true, | |
| "use_flash": true, | |
| "mask_noise": 0.1 | |
| } | |
| }, | |
| "bottleneck": { | |
| "type": "softnorm", | |
| "config": { | |
| "dim": 256, | |
| "noise_augment_dim": 0, | |
| "noise_regularize": true, | |
| "auto_scale": true | |
| } | |
| }, | |
| "latent_dim": 256, | |
| "downsampling_ratio": 4096, | |
| "io_channels": 2 | |
| } | |
| }, | |
| "conditioning": { | |
| "configs": [ | |
| { | |
| "id": "prompt", | |
| "type": "t5gemma", | |
| "config": { | |
| "max_length": 256, | |
| "padding_mode": "learned", | |
| "repo_id": "cocktailpeanut/stable-audio-3-medium", | |
| "subfolder": "t5gemma-b-b-ul2" | |
| } | |
| }, | |
| { | |
| "id": "seconds_total", | |
| "type": "number", | |
| "config": { | |
| "min_val": 0, | |
| "max_val": 384, | |
| "fourier_features_type": "expo" | |
| } | |
| } | |
| ], | |
| "cond_dim": 768 | |
| }, | |
| "diffusion": { | |
| "cross_attention_cond_ids": [ | |
| "prompt", | |
| "seconds_total" | |
| ], | |
| "global_cond_ids": [ | |
| "seconds_total" | |
| ], | |
| "local_add_cond_ids": [ | |
| "inpaint_mask", | |
| "inpaint_masked_input" | |
| ], | |
| "type": "dit", | |
| "diffusion_objective": "rf_denoiser", | |
| "mask_padding_attention": true, | |
| "use_effective_length_for_schedule": true, | |
| "distribution_shift_options": { | |
| "min_length": 256, | |
| "max_length": 4096 | |
| }, | |
| "config": { | |
| "io_channels": 256, | |
| "embed_dim": 1536, | |
| "depth": 24, | |
| "num_heads": 24, | |
| "cond_token_dim": 768, | |
| "global_cond_dim": 768, | |
| "local_add_cond_dim": 257, | |
| "global_cond_type": "adaLN", | |
| "timestep_features_type": "expo", | |
| "attn_kwargs": { | |
| "qk_norm": "rms", | |
| "differential": true | |
| }, | |
| "norm_type": "rms_norm", | |
| "norm_kwargs": { | |
| "force_fp32": true | |
| }, | |
| "ff_kwargs": { | |
| "mult": 4.0 | |
| }, | |
| "num_memory_tokens": 64 | |
| } | |
| }, | |
| "io_channels": 256 | |
| }, | |
| "training": { | |
| "use_ema": true, | |
| "log_loss_info": false, | |
| "pre_encoded": true, | |
| "ot_coupling": true, | |
| "silence_extension_scale_seconds": 4.0, | |
| "timestep_sampler": "trunc_logit_normal", | |
| "mask_loss_weight": 1.0, | |
| "cfg_dropout_prob": 0.1, | |
| "inpainting": { | |
| "mask_kwargs": { | |
| "mask_type_probabilities": [ | |
| 0.1, | |
| 0.8, | |
| 0.1 | |
| ] | |
| } | |
| }, | |
| "arc": { | |
| "noise_dist": { | |
| "generator": "trunc_logit_normal", | |
| "discriminator": "logit_normal" | |
| }, | |
| "disc_update_interval": 2, | |
| "use_model_as_discriminator": true, | |
| "discriminator_base_ckpt": "/path/to/discriminator/ckpt.pt", | |
| "discriminator": { | |
| "type": "dilated_conv", | |
| "dit_hidden_layer": [ | |
| 18 | |
| ], | |
| "weights": { | |
| "generator": 1.0, | |
| "discriminator": 1.0 | |
| }, | |
| "reset_every": 250, | |
| "loss_type": "relativistic", | |
| "config": { | |
| "hidden_dim": 1024, | |
| "dilations": [ | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ] | |
| }, | |
| "disc_hinge_loss": false, | |
| "contrastive": true, | |
| "include_grad_penalties": false | |
| } | |
| }, | |
| "optimizer_configs": { | |
| "diffusion": { | |
| "optimizer": { | |
| "type": "MuonAdamW", | |
| "config": { | |
| "muon_lr": 1e-05, | |
| "muon_momentum": 0.95, | |
| "adam_lr": 1e-06, | |
| "adam_betas": [ | |
| 0.9, | |
| 0.95 | |
| ], | |
| "adam_weight_decay": 0.01, | |
| "fused_layer_patterns": [ | |
| "*.to_qkv.*", | |
| "*.to_kv.*", | |
| "*.to_q.*", | |
| "*.ff.*.proj.*" | |
| ] | |
| } | |
| }, | |
| "scheduler": { | |
| "type": "InverseLR", | |
| "config": { | |
| "inv_gamma": 1000000, | |
| "power": 0.5, | |
| "warmup": 0.95 | |
| } | |
| } | |
| }, | |
| "discriminator": { | |
| "optimizer": { | |
| "type": "MuonAdamW", | |
| "config": { | |
| "muon_lr": 1e-05, | |
| "muon_momentum": 0.95, | |
| "adam_lr": 1e-06, | |
| "adam_betas": [ | |
| 0.9, | |
| 0.95 | |
| ], | |
| "adam_weight_decay": 0.01, | |
| "fused_layer_patterns": [ | |
| "*.to_qkv.*", | |
| "*.to_kv.*", | |
| "*.to_q.*", | |
| "*.ff.*.proj.*" | |
| ] | |
| } | |
| }, | |
| "scheduler": { | |
| "type": "InverseLR", | |
| "config": { | |
| "inv_gamma": 1000000, | |
| "power": 0.5, | |
| "warmup": 0.9 | |
| } | |
| } | |
| } | |
| }, | |
| "demo": { | |
| "demo_every": 500, | |
| "demo_steps": 8, | |
| "num_demos": 2, | |
| "demo_cond": [ | |
| { | |
| "prompt": "Meditative lo-fi ambient piano jazz, soft acoustic drum kit", | |
| "seconds_total": 190 | |
| }, | |
| { | |
| "prompt": "A tropical house track with upbeat melodies, a driving bassline, and cheery vibes", | |
| "seconds_total": 180 | |
| } | |
| ], | |
| "demo_cfg_scales": [ | |
| 1 | |
| ] | |
| } | |
| } | |
| } |