{
"model_type": "diffusion_cond",
"sample_size": 882000,
"sample_rate": 44100,
"audio_channels": 2,
"model": {
"pretransform": {
"type": "autoencoder",
"iterate_batch": true,
"config": {
"encoder": {
"type": "oobleck",
"requires_grad": false,
"config": {
"in_channels": 2,
"channels": 128,
"c_mults": [1, 2, 4, 8, 16],
"strides": [2, 4, 4, 8, 8],
"latent_dim": 128,
"use_snake": true
}
},
"decoder": {
"type": "oobleck",
"config": {
"out_channels": 2,
"channels": 128,
"c_mults": [1, 2, 4, 8, 16],
"strides": [2, 4, 4, 8, 8],
"latent_dim": 64,
"use_snake": true,
"final_tanh": false
}
},
"bottleneck": {
"type": "vae"
},
"latent_dim": 64,
"downsampling_ratio": 2048,
"io_channels": 2
}
},
"conditioning": {
"configs": [
{
"id": "prompt",
"type": "t5",
"config": {
"t5_model_name": "t5-base",
"max_length": 128
}
},
{
"id": "seconds_start",
"type": "number",
"config": {
"min_val": 0,
"max_val": 512
}
},
{
"id": "seconds_total",
"type": "number",
"config": {
"min_val": 0,
"max_val": 512
}
}
],
"cond_dim": 768
},
"diffusion": {
"cross_attention_cond_ids": ["prompt", "seconds_start", "seconds_total"],
"global_cond_ids": ["seconds_start", "seconds_total"],
"type": "dit",
"config": {
"io_channels": 64,
"embed_dim": 1536,
"depth": 24,
"num_heads": 24,
"cond_token_dim": 768,
"global_cond_dim": 1536,
"project_cond_tokens": false,
"transformer_type": "continuous_transformer"
}
},
"io_channels": 64
},
"training": {
"use_ema": true,
"log_loss_info": false,
"optimizer_configs": {
"diffusion": {
"optimizer": {
"type": "AdamW",
"config": {
"lr": 5e-5,
"betas": [0.9, 0.999],
"weight_decay": 1e-3
}
},
"scheduler": {
"type": "InverseLR",
"config": {
"inv_gamma": 1000000,
"power": 0.5,
"warmup": 0.99
}
}
}
},
"demo": {
"demo_every": 14784,
"demo_steps": 250,
"num_demos": 10,
"demo_cond": [
{"prompt": "Guitar, Steel Guitar, Mids, Upper Mids, Highs, Pluck, Bright, Clean, simple melody, D minor, 100 BPM, 8 bars", "seconds_start": 0, "seconds_total": 19},
{"prompt": "Guitar, Steel Guitar, Mids, Upper Mids, Highs, Pluck, Bright, Clean, simple melody, D minor, 100 BPM, 8 bars", "seconds_start": 0, "seconds_total": 19},
{"prompt": "Bowed Strings, Violin, Cello, Mids, Upper Mids, Staccato, Tight, Focused, Smooth, Present, Near, Ensemble, Wet, catchy dance chord progression, with top dance melody, C# minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15},
{"prompt": "Vocal, Synthetic, Bass, Mids, Upper Mids, Highs, Digital, Retro, Synthetic Vox, D major, 140 BPM, 8 bars", "seconds_start": 0, "seconds_total": 14},
{"prompt": "Keys, Grand Piano, Highs, Full, Subdued, Wet, Medium Reverb, Medium Phaser, medium speed, off beat, repeating, melody, C minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15},
{"prompt": "Bass, FM Bass, Sub Bass, Bass, Upper Mids, Highs, Thick, Clean, Pitch Bend, Wet, Medium Delay, Medium Reverb, Low Distortion, Phaser, 4 bars, 150 BPM, D minor", "seconds_start": 0, "seconds_total": 6},
{"prompt": "Wind, World Winds, Flute, Airy, Hollow, Bb major, 110 BPM, 4 bars", "seconds_start": 0, "seconds_total": 8},
{"prompt": "Bass, Wavetable Bass, Sub Bass, Bass, Upper Mids, Highs, Acid, 8 bars, 128 BPM, E minor", "seconds_start": 0, "seconds_total": 15},
{"prompt": "Ocarina, Formant Vocal, Warm, Rich, Clean, Medium Reverb, F minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15},
{"prompt": "Ocarina, Formant Vocal, Warm, Rich, Clean, Medium Reverb, F minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15}
],
"demo_cfg_scales": [7]
}
}
}