Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +32 -0
config.json +188 -0
model_index.json +3 -0
transformer/config.json +1 -0
transformer/diffusion_pytorch_model.safetensors +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,32 @@

+---
+license: apache-2.0
+tags:
+  - vllm-omni
+  - audiox
+  - test-fixture
+---
+# AudioX random / test fixture
+A tiny **random-init** bundle of [vLLM-Omni](https://github.com/vllm-project/vllm-omni)'s
+`AudioXPipeline`. Used by the L1/L2 `core_model` CI tests
+(`tests/e2e/offline_inference/test_audiox_model.py`,
+`tests/e2e/online_serving/test_audiox_online.py`) so they can verify the full
+pipeline (load → forward → trim → return numpy WAV) end-to-end without paying
+the cost of the real ~11 GB checkpoint.
+It follows the same `config.json` schema as
+[`zhangj1an/AudioX`](https://huggingface.co/zhangj1an/AudioX), but with much
+smaller transformer dimensions:
+- `embed_dim`: 1536 → 384
+- `depth`: 24 → 4
+- `num_heads`: 24 → 6
+- `gate_type_config.num_experts_per_modality`: 64 → 16
+- `gate_type_config.num_fusion_layers`: 8 → 2
+- `sample_size`: 485100 → 483328 (= 236 × 2048, an exact multiple of the autoencoder's
+  `downsampling_ratio`; still gives `latent_len = sample_size // 2048 = 236`,
+  matching the transformer's RoPE precompute)
+All weights are random fp16 values, generated by running `AudioXPipeline.__init__`
+with the small config and dumping the resulting `state_dict()` using the bundle's
+legacy naming convention. **Do not use for actual generation** — outputs are noise.

config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+  "model_type": "diffusion_cond",
+  "sample_size": 483328,
+  "sample_rate": 44100,
+  "video_fps": 5,
+  "audio_channels": 2,
+  "model": {
+    "pretransform": {
+      "type": "autoencoder",
+      "iterate_batch": true,
+      "config": {
+        "encoder": {
+          "type": "oobleck",
+          "requires_grad": false,
+          "config": {
+            "in_channels": 2,
+            "channels": 128,
+            "c_mults": [
+              1,
+              2,
+              4,
+              8,
+              16
+            ],
+            "strides": [
+              2,
+              4,
+              4,
+              8,
+              8
+            ],
+            "latent_dim": 128,
+            "use_snake": true
+          }
+        },
+        "decoder": {
+          "type": "oobleck",
+          "config": {
+            "out_channels": 2,
+            "channels": 128,
+            "c_mults": [
+              1,
+              2,
+              4,
+              8,
+              16
+            ],
+            "strides": [
+              2,
+              4,
+              4,
+              8,
+              8
+            ],
+            "latent_dim": 64,
+            "use_snake": true,
+            "final_tanh": false
+          }
+        },
+        "bottleneck": {
+          "type": "vae"
+        },
+        "latent_dim": 64,
+        "downsampling_ratio": 2048,
+        "io_channels": 2
+      }
+    },
+    "conditioning": {
+      "configs": [
+        {
+          "id": "video_prompt",
+          "type": "clip-with-sync-w-empty-feat",
+          "config": {
+            "clip_model_name": "openai/clip-vit-base-patch32"
+          }
+        },
+        {
+          "id": "text_prompt",
+          "type": "t5",
+          "config": {
+            "t5_model_name": "t5-base",
+            "max_length": 128
+          }
+        },
+        {
+          "id": "audio_prompt",
+          "type": "audio_autoencoder_v2",
+          "config": {
+            "sample_rate": 44100,
+            "pretransform_config": {
+              "type": "autoencoder",
+              "iterate_batch": true,
+              "config": {
+                "encoder": {
+                  "type": "oobleck",
+                  "requires_grad": false,
+                  "config": {
+                    "in_channels": 2,
+                    "channels": 128,
+                    "c_mults": [
+                      1,
+                      2,
+                      4,
+                      8,
+                      16
+                    ],
+                    "strides": [
+                      2,
+                      4,
+                      4,
+                      8,
+                      8
+                    ],
+                    "latent_dim": 128,
+                    "use_snake": true
+                  }
+                },
+                "decoder": {
+                  "type": "oobleck",
+                  "config": {
+                    "out_channels": 2,
+                    "channels": 128,
+                    "c_mults": [
+                      1,
+                      2,
+                      4,
+                      8,
+                      16
+                    ],
+                    "strides": [
+                      2,
+                      4,
+                      4,
+                      8,
+                      8
+                    ],
+                    "latent_dim": 64,
+                    "use_snake": true,
+                    "final_tanh": false
+                  }
+                },
+                "bottleneck": {
+                  "type": "vae"
+                },
+                "latent_dim": 64,
+                "downsampling_ratio": 2048,
+                "io_channels": 2
+              }
+            },
+            "pretransform_ckpt_path": "./model/VAE.ckpt",
+            "latent_seq_len": 50,
+            "mask_ratio_start": 0,
+            "mask_ratio_end": 0
+          }
+        }
+      ],
+      "cond_dim": 768
+    },
+    "diffusion": {
+      "cross_attention_cond_ids": [
+        "video_prompt",
+        "text_prompt",
+        "audio_prompt"
+      ],
+      "global_cond_ids": [],
+      "type": "mmdit",
+      "gate": true,
+      "gate_type": "MAF",
+      "gate_type_config": {
+        "num_experts_per_modality": 16,
+        "num_heads": 6,
+        "num_fusion_layers": 2
+      },
+      "config": {
+        "io_channels": 64,
+        "embed_dim": 384,
+        "depth": 4,
+        "num_heads": 6,
+        "cond_token_dim": 768,
+        "global_cond_dim": 768,
+        "project_cond_tokens": false,
+        "transformer_type": "continuous_transformer",
+        "video_fps": 5
+      }
+    },
+    "io_channels": 64
+  }
+}

model_index.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "_class_name": "AudioXPipeline"
+}

transformer/config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7f1106d724ad39bbe6ee84786846c6b9f1ab54398ea0fc6dc860b4ad878ed02
+size 1220352616