zhangj1an committed on
Commit
b79aa87
·
verified ·
1 Parent(s): 42e017d

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - vllm-omni
5
+ - audiox
6
+ - test-fixture
7
+ ---
8
+
9
+ # AudioX random / test fixture
10
+
11
+ A tiny **random-init** bundle of [vLLM-Omni](https://github.com/vllm-project/vllm-omni)'s
12
+ `AudioXPipeline`. Used by the L1/L2 `core_model` CI tests
13
+ (`tests/e2e/offline_inference/test_audiox_model.py`,
14
+ `tests/e2e/online_serving/test_audiox_online.py`) so they can verify the full
15
+ pipeline (load → forward → trim → return numpy WAV) end-to-end without paying
16
+ the cost of the real ~11 GB checkpoint.
17
+
18
+ It follows the same `config.json` schema as
19
+ [`zhangj1an/AudioX`](https://huggingface.co/zhangj1an/AudioX), but with much
20
+ smaller transformer dimensions:
21
+
22
+ - `embed_dim`: 1536 → 384
23
+ - `depth`: 24 → 4
24
+ - `num_heads`: 24 → 6
25
+ - `gate_type_config.num_experts_per_modality`: 64 → 16
26
+ - `gate_type_config.num_fusion_layers`: 8 → 2
27
+ - `sample_size`: 485100 → 483328 (still gives `latent_len = sample_size // 2048 = 236`,
28
+ matching the transformer's RoPE precompute)
29
+
30
+ All weights are random fp16 values, generated by running `AudioXPipeline.__init__`
31
+ with the small config and dumping its `state_dict()` with the bundle's legacy
32
+ naming convention. **Do not use for actual generation** — outputs are noise.
config.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "diffusion_cond",
3
+ "sample_size": 483328,
4
+ "sample_rate": 44100,
5
+ "video_fps": 5,
6
+ "audio_channels": 2,
7
+ "model": {
8
+ "pretransform": {
9
+ "type": "autoencoder",
10
+ "iterate_batch": true,
11
+ "config": {
12
+ "encoder": {
13
+ "type": "oobleck",
14
+ "requires_grad": false,
15
+ "config": {
16
+ "in_channels": 2,
17
+ "channels": 128,
18
+ "c_mults": [
19
+ 1,
20
+ 2,
21
+ 4,
22
+ 8,
23
+ 16
24
+ ],
25
+ "strides": [
26
+ 2,
27
+ 4,
28
+ 4,
29
+ 8,
30
+ 8
31
+ ],
32
+ "latent_dim": 128,
33
+ "use_snake": true
34
+ }
35
+ },
36
+ "decoder": {
37
+ "type": "oobleck",
38
+ "config": {
39
+ "out_channels": 2,
40
+ "channels": 128,
41
+ "c_mults": [
42
+ 1,
43
+ 2,
44
+ 4,
45
+ 8,
46
+ 16
47
+ ],
48
+ "strides": [
49
+ 2,
50
+ 4,
51
+ 4,
52
+ 8,
53
+ 8
54
+ ],
55
+ "latent_dim": 64,
56
+ "use_snake": true,
57
+ "final_tanh": false
58
+ }
59
+ },
60
+ "bottleneck": {
61
+ "type": "vae"
62
+ },
63
+ "latent_dim": 64,
64
+ "downsampling_ratio": 2048,
65
+ "io_channels": 2
66
+ }
67
+ },
68
+ "conditioning": {
69
+ "configs": [
70
+ {
71
+ "id": "video_prompt",
72
+ "type": "clip-with-sync-w-empty-feat",
73
+ "config": {
74
+ "clip_model_name": "openai/clip-vit-base-patch32"
75
+ }
76
+ },
77
+ {
78
+ "id": "text_prompt",
79
+ "type": "t5",
80
+ "config": {
81
+ "t5_model_name": "t5-base",
82
+ "max_length": 128
83
+ }
84
+ },
85
+ {
86
+ "id": "audio_prompt",
87
+ "type": "audio_autoencoder_v2",
88
+ "config": {
89
+ "sample_rate": 44100,
90
+ "pretransform_config": {
91
+ "type": "autoencoder",
92
+ "iterate_batch": true,
93
+ "config": {
94
+ "encoder": {
95
+ "type": "oobleck",
96
+ "requires_grad": false,
97
+ "config": {
98
+ "in_channels": 2,
99
+ "channels": 128,
100
+ "c_mults": [
101
+ 1,
102
+ 2,
103
+ 4,
104
+ 8,
105
+ 16
106
+ ],
107
+ "strides": [
108
+ 2,
109
+ 4,
110
+ 4,
111
+ 8,
112
+ 8
113
+ ],
114
+ "latent_dim": 128,
115
+ "use_snake": true
116
+ }
117
+ },
118
+ "decoder": {
119
+ "type": "oobleck",
120
+ "config": {
121
+ "out_channels": 2,
122
+ "channels": 128,
123
+ "c_mults": [
124
+ 1,
125
+ 2,
126
+ 4,
127
+ 8,
128
+ 16
129
+ ],
130
+ "strides": [
131
+ 2,
132
+ 4,
133
+ 4,
134
+ 8,
135
+ 8
136
+ ],
137
+ "latent_dim": 64,
138
+ "use_snake": true,
139
+ "final_tanh": false
140
+ }
141
+ },
142
+ "bottleneck": {
143
+ "type": "vae"
144
+ },
145
+ "latent_dim": 64,
146
+ "downsampling_ratio": 2048,
147
+ "io_channels": 2
148
+ }
149
+ },
150
+ "pretransform_ckpt_path": "./model/VAE.ckpt",
151
+ "latent_seq_len": 50,
152
+ "mask_ratio_start": 0,
153
+ "mask_ratio_end": 0
154
+ }
155
+ }
156
+ ],
157
+ "cond_dim": 768
158
+ },
159
+ "diffusion": {
160
+ "cross_attention_cond_ids": [
161
+ "video_prompt",
162
+ "text_prompt",
163
+ "audio_prompt"
164
+ ],
165
+ "global_cond_ids": [],
166
+ "type": "mmdit",
167
+ "gate": true,
168
+ "gate_type": "MAF",
169
+ "gate_type_config": {
170
+ "num_experts_per_modality": 16,
171
+ "num_heads": 6,
172
+ "num_fusion_layers": 2
173
+ },
174
+ "config": {
175
+ "io_channels": 64,
176
+ "embed_dim": 384,
177
+ "depth": 4,
178
+ "num_heads": 6,
179
+ "cond_token_dim": 768,
180
+ "global_cond_dim": 768,
181
+ "project_cond_tokens": false,
182
+ "transformer_type": "continuous_transformer",
183
+ "video_fps": 5
184
+ }
185
+ },
186
+ "io_channels": 64
187
+ }
188
+ }
model_index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "_class_name": "AudioXPipeline"
3
+ }
transformer/config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7f1106d724ad39bbe6ee84786846c6b9f1ab54398ea0fc6dc860b4ad878ed02
3
+ size 1220352616