Zeyue7 committed on
Commit
ba51740
·
1 Parent(s): 11642fb

upload audiox-maf

Browse files
Files changed (2) hide show
  1. config.json +137 -0
  2. model.ckpt +3 -0
config.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "diffusion_cond",
3
+ "sample_size": 485100,
4
+ "sample_rate": 44100,
5
+ "video_fps": 5,
6
+ "audio_channels": 2,
7
+ "model": {
8
+ "pretransform": {
9
+ "type": "autoencoder",
10
+ "iterate_batch": true,
11
+ "config": {
12
+ "encoder": {
13
+ "type": "oobleck",
14
+ "requires_grad": false,
15
+ "config": {
16
+ "in_channels": 2,
17
+ "channels": 128,
18
+ "c_mults": [1, 2, 4, 8, 16],
19
+ "strides": [2, 4, 4, 8, 8],
20
+ "latent_dim": 128,
21
+ "use_snake": true
22
+ }
23
+ },
24
+ "decoder": {
25
+ "type": "oobleck",
26
+ "config": {
27
+ "out_channels": 2,
28
+ "channels": 128,
29
+ "c_mults": [1, 2, 4, 8, 16],
30
+ "strides": [2, 4, 4, 8, 8],
31
+ "latent_dim": 64,
32
+ "use_snake": true,
33
+ "final_tanh": false
34
+ }
35
+ },
36
+ "bottleneck": {
37
+ "type": "vae"
38
+ },
39
+ "latent_dim": 64,
40
+ "downsampling_ratio": 2048,
41
+ "io_channels": 2
42
+ }
43
+ },
44
+ "conditioning": {
45
+ "configs": [
46
+ {
47
+ "id": "video_prompt",
48
+ "type": "clip-with-sync-w-empty-feat",
49
+ "config": {
50
+ "clip_model_name": "clip-vit-base-patch32"
51
+ }
52
+ },
53
+ {
54
+ "id": "text_prompt",
55
+ "type": "t5",
56
+ "config": {
57
+ "t5_model_name": "t5-base",
58
+ "max_length": 128
59
+ }
60
+ },
61
+ {
62
+ "id": "audio_prompt",
63
+ "type": "mel_spec",
64
+ "config": {
65
+ "mel_spec_type": "mel_features",
66
+ "n_fft": 1024,
67
+ "hop_length": 256,
68
+ "win_length": 1024,
69
+ "n_mel_channels": 256,
70
+ "target_sample_rate": 24000
71
+ }
72
+ }
73
+ ],
74
+ "cond_dim": 768
75
+ },
76
+ "diffusion": {
77
+ "cross_attention_cond_ids": ["video_prompt", "text_prompt", "audio_prompt"],
78
+ "global_cond_ids": [],
79
+ "type": "dit",
80
+ "gate": true,
81
+ "gate_type": "MAF",
82
+ "gate_type_config": {
83
+ "num_experts_per_modality": 64,
84
+ "num_heads": 24,
85
+ "num_fusion_layers": 8
86
+ },
87
+ "config": {
88
+ "io_channels": 64,
89
+ "embed_dim": 1536,
90
+ "depth": 24,
91
+ "num_heads": 24,
92
+ "cond_token_dim": 768,
93
+ "global_cond_dim": 768,
94
+ "project_cond_tokens": false,
95
+ "transformer_type": "continuous_transformer",
96
+ "video_fps": 5
97
+ }
98
+ },
99
+ "io_channels": 64
100
+ },
101
+ "training": {
102
+ "use_ema": true,
103
+ "log_loss_info": false,
104
+ "optimizer_configs": {
105
+ "diffusion": {
106
+ "optimizer": {
107
+ "type": "AdamW",
108
+ "config": {
109
+ "lr": 5e-5,
110
+ "betas": [0.9, 0.999],
111
+ "weight_decay": 1e-3
112
+ }
113
+ },
114
+ "scheduler": {
115
+ "type": "InverseLR",
116
+ "config": {
117
+ "inv_gamma": 1000000,
118
+ "power": 0.5,
119
+ "warmup": 0.99
120
+ }
121
+ }
122
+ }
123
+ },
124
+ "demo": {
125
+ "demo_every": 2000,
126
+ "demo_steps": 250,
127
+ "num_demos": 4,
128
+ "demo_cond": [
129
+ {"prompt": "Amen break 174 BPM", "seconds_start": 0, "seconds_total": 12},
130
+ {"prompt": "A beautiful orchestral symphony, classical music", "seconds_start": 0, "seconds_total": 160},
131
+ {"prompt": "Chill hip-hop beat, chillhop", "seconds_start": 0, "seconds_total": 190},
132
+ {"prompt": "A pop song about love and loss", "seconds_start": 0, "seconds_total": 180}
133
+ ],
134
+ "demo_cfg_scales": [3, 6, 9]
135
+ }
136
+ }
137
+ }
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b59a3b6d3e12bec0bd26b0e5ecae3035f5bccf6d88f839a5608f840e07a208d
3
+ size 5572044828