Zeyue7 committed on
Commit
3c7224a
·
1 Parent(s): d5be467

upload_audiox-mmdit

Browse files
Files changed (3) hide show
  1. VAE.ckpt +3 -0
  2. config.json +136 -0
  3. model.ckpt +3 -0
VAE.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02e8a84bd5c1ee8a812609b03286ec85b856cb3ee8cd607083563de67347e621
3
+ size 624540628
config.json ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "diffusion_cond",
3
+ "sample_size": 485100,
4
+ "sample_rate": 44100,
5
+ "video_fps": 5,
6
+ "audio_channels": 2,
7
+ "model": {
8
+ "pretransform": {
9
+ "type": "autoencoder",
10
+ "iterate_batch": true,
11
+ "config": {
12
+ "encoder": {
13
+ "type": "oobleck",
14
+ "requires_grad": false,
15
+ "config": {
16
+ "in_channels": 2,
17
+ "channels": 128,
18
+ "c_mults": [1, 2, 4, 8, 16],
19
+ "strides": [2, 4, 4, 8, 8],
20
+ "latent_dim": 128,
21
+ "use_snake": true
22
+ }
23
+ },
24
+ "decoder": {
25
+ "type": "oobleck",
26
+ "config": {
27
+ "out_channels": 2,
28
+ "channels": 128,
29
+ "c_mults": [1, 2, 4, 8, 16],
30
+ "strides": [2, 4, 4, 8, 8],
31
+ "latent_dim": 64,
32
+ "use_snake": true,
33
+ "final_tanh": false
34
+ }
35
+ },
36
+ "bottleneck": {
37
+ "type": "vae"
38
+ },
39
+ "latent_dim": 64,
40
+ "downsampling_ratio": 2048,
41
+ "io_channels": 2
42
+ }
43
+ },
44
+ "conditioning": {
45
+ "configs": [
46
+ {
47
+ "id": "video_prompt",
48
+ "type": "clip-with-sync-w-empty-feat",
49
+ "config": {
50
+ "clip_model_name": "clip-vit-base-patch32"
51
+ }
52
+ },
53
+ {
54
+ "id": "text_prompt",
55
+ "type": "t5",
56
+ "config": {
57
+ "t5_model_name": "t5-base",
58
+ "max_length": 128
59
+ }
60
+ },
61
+ {
62
+ "id": "audio_prompt",
63
+ "type": "audio_autoencoder_v2",
64
+ "config": {
65
+ "sample_rate": 44100,
66
+ "pretransform_config": {
67
+ "type": "autoencoder",
68
+ "iterate_batch": true,
69
+ "config": {
70
+ "encoder": {
71
+ "type": "oobleck",
72
+ "requires_grad": false,
73
+ "config": {
74
+ "in_channels": 2,
75
+ "channels": 128,
76
+ "c_mults": [1, 2, 4, 8, 16],
77
+ "strides": [2, 4, 4, 8, 8],
78
+ "latent_dim": 128,
79
+ "use_snake": true
80
+ }
81
+ },
82
+ "decoder": {
83
+ "type": "oobleck",
84
+ "config": {
85
+ "out_channels": 2,
86
+ "channels": 128,
87
+ "c_mults": [1, 2, 4, 8, 16],
88
+ "strides": [2, 4, 4, 8, 8],
89
+ "latent_dim": 64,
90
+ "use_snake": true,
91
+ "final_tanh": false
92
+ }
93
+ },
94
+ "bottleneck": {
95
+ "type": "vae"
96
+ },
97
+ "latent_dim": 64,
98
+ "downsampling_ratio": 2048,
99
+ "io_channels": 2
100
+ }
101
+ },
102
+ "pretransform_ckpt_path": "./model/VAE.ckpt",
103
+ "latent_seq_len": 215,
104
+ "mask_ratio_start": 0,
105
+ "mask_ratio_end": 0
106
+ }
107
+ }
108
+ ],
109
+ "cond_dim": 768
110
+ },
111
+ "diffusion": {
112
+ "cross_attention_cond_ids": ["video_prompt", "text_prompt", "audio_prompt"],
113
+ "global_cond_ids": [],
114
+ "type": "mmdit",
115
+ "gate": true,
116
+ "gate_type": "MAF",
117
+ "gate_type_config": {
118
+ "num_experts_per_modality": 64,
119
+ "num_heads": 24,
120
+ "num_fusion_layers": 8
121
+ },
122
+ "config": {
123
+ "io_channels": 64,
124
+ "embed_dim": 1536,
125
+ "depth": 24,
126
+ "num_heads": 24,
127
+ "cond_token_dim": 768,
128
+ "global_cond_dim": 768,
129
+ "project_cond_tokens": false,
130
+ "transformer_type": "continuous_transformer",
131
+ "video_fps": 5
132
+ }
133
+ },
134
+ "io_channels": 64
135
+ }
136
+ }
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b20ef15c59f33d7daae3601cee32c61212cf256edc7576c2c5db9390299aa66c
3
+ size 10843616882