File size: 5,602 Bytes
c03e58d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
{
    "model_type": "diffusion_cond",
    "sample_size": 882000,
    "sample_rate": 44100,
    "audio_channels": 2,
    "model": {
        "pretransform": {
            "type": "autoencoder",
            "iterate_batch": true,
            "config": {
                "encoder": {
                    "type": "oobleck",
                    "requires_grad": false,
                    "config": {
                        "in_channels": 2,
                        "channels": 128,
                        "c_mults": [1, 2, 4, 8, 16],
                        "strides": [2, 4, 4, 8, 8],
                        "latent_dim": 128,
                        "use_snake": true
                    }
                },
                "decoder": {
                    "type": "oobleck",
                    "config": {
                        "out_channels": 2,
                        "channels": 128,
                        "c_mults": [1, 2, 4, 8, 16],
                        "strides": [2, 4, 4, 8, 8],
                        "latent_dim": 64,
                        "use_snake": true,
                        "final_tanh": false
                    }
                },
                "bottleneck": {
                    "type": "vae"
                },
                "latent_dim": 64,
                "downsampling_ratio": 2048,
                "io_channels": 2
            }
        },
        "conditioning": {
            "configs": [
                {
                    "id": "prompt",
                    "type": "t5",
                    "config": {
                        "t5_model_name": "t5-base",
                        "max_length": 128
                    }
                },
                {
                    "id": "seconds_start",
                    "type": "number",
                    "config": {
                        "min_val": 0,
                        "max_val": 512
                    }
                },
                {
                    "id": "seconds_total",
                    "type": "number",
                    "config": {
                        "min_val": 0,
                        "max_val": 512
                    }
                }
            ],
            "cond_dim": 768
        },
        "diffusion": {
            "cross_attention_cond_ids": ["prompt", "seconds_start", "seconds_total"],
            "global_cond_ids": ["seconds_start", "seconds_total"],
            "type": "dit",
            "config": {
                "io_channels": 64,
                "embed_dim": 1536,
                "depth": 24,
                "num_heads": 24,
                "cond_token_dim": 768,
                "global_cond_dim": 1536,
                "project_cond_tokens": false,
                "transformer_type": "continuous_transformer"
            }
        },
        "io_channels": 64
    },
    "training": {
        "use_ema": true,
        "log_loss_info": false,
        "optimizer_configs": {
            "diffusion": {
                "optimizer": {
                    "type": "AdamW",
                    "config": {
                        "lr": 5e-5,
                        "betas": [0.9, 0.999],
                        "weight_decay": 1e-3
                    }
                },
                "scheduler": {
                    "type": "InverseLR",
                    "config": {
                        "inv_gamma": 1000000,
                        "power": 0.5,
                        "warmup": 0.99
                    }
                }
            }
        },
        "demo": {
            "demo_every": 14784,
            "demo_steps": 250,
            "num_demos": 10,
            "demo_cond": [
                {"prompt": "Guitar, Steel Guitar, Mids, Upper Mids, Highs, Pluck, Bright, Clean, simple melody, D minor, 100 BPM, 8 bars", "seconds_start": 0, "seconds_total": 19},
                {"prompt": "Guitar, Steel Guitar, Mids, Upper Mids, Highs, Pluck, Bright, Clean, simple melody, D minor, 100 BPM, 8 bars", "seconds_start": 0, "seconds_total": 19},
                {"prompt": "Bowed Strings, Violin, Cello, Mids, Upper Mids, Staccato, Tight, Focused, Smooth, Present, Near, Ensemble, Wet, catchy dance chord progression, with top dance melody, C# minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15},
                {"prompt": "Vocal, Synthetic, Bass, Mids, Upper Mids, Highs, Digital, Retro, Synthetic Vox, D major, 140 BPM, 8 bars", "seconds_start": 0, "seconds_total": 14},
                {"prompt": "Keys, Grand Piano, Highs, Full, Subdued, Wet, Medium Reverb, Medium Phaser, medium speed, off beat, repeating, melody, C minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15},
                {"prompt": "Bass, FM Bass, Sub Bass, Bass, Upper Mids, Highs, Thick, Clean, Pitch Bend, Wet, Medium Delay, Medium Reverb, Low Distortion, Phaser, 4 bars, 150 BPM, D minor", "seconds_start": 0, "seconds_total": 6},
                {"prompt": "Wind, World Winds, Flute, Airy, Hollow, Bb major, 110 BPM, 4 bars", "seconds_start": 0, "seconds_total": 8},
                {"prompt": "Bass, Wavetable Bass, Sub Bass, Bass, Upper Mids, Highs, Acid, 8 bars, 128 BPM, E minor", "seconds_start": 0, "seconds_total": 15},
                {"prompt": "Ocarina, Formant Vocal, Warm, Rich, Clean, Medium Reverb, F minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15},
                {"prompt": "Ocarina, Formant Vocal, Warm, Rich, Clean, Medium Reverb, F minor, 128 BPM, 8 bars", "seconds_start": 0, "seconds_total": 15}
            ],
            "demo_cfg_scales": [7]
        }
    }
}