upload_audiox-mmdit

Browse files

Files changed (3) hide show

VAE.ckpt +3 -0
config.json +136 -0
model.ckpt +3 -0

VAE.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02e8a84bd5c1ee8a812609b03286ec85b856cb3ee8cd607083563de67347e621
+size 624540628

config.json ADDED Viewed

	@@ -0,0 +1,136 @@

+{
+    "model_type": "diffusion_cond",
+    "sample_size": 485100,
+    "sample_rate": 44100,
+    "video_fps": 5,
+    "audio_channels": 2,
+    "model": {
+        "pretransform": {
+            "type": "autoencoder",
+            "iterate_batch": true,
+            "config": {
+                "encoder": {
+                    "type": "oobleck",
+                    "requires_grad": false,
+                    "config": {
+                        "in_channels": 2,
+                        "channels": 128,
+                        "c_mults": [1, 2, 4, 8, 16],
+                        "strides": [2, 4, 4, 8, 8],
+                        "latent_dim": 128,
+                        "use_snake": true
+                    }
+                },
+                "decoder": {
+                    "type": "oobleck",
+                    "config": {
+                        "out_channels": 2,
+                        "channels": 128,
+                        "c_mults": [1, 2, 4, 8, 16],
+                        "strides": [2, 4, 4, 8, 8],
+                        "latent_dim": 64,
+                        "use_snake": true,
+                        "final_tanh": false
+                    }
+                },
+                "bottleneck": {
+                    "type": "vae"
+                },
+                "latent_dim": 64,
+                "downsampling_ratio": 2048,
+                "io_channels": 2
+            }
+        },
+        "conditioning": {
+            "configs": [
+                {
+                    "id": "video_prompt",
+                    "type": "clip-with-sync-w-empty-feat",
+                    "config": {
+                        "clip_model_name": "clip-vit-base-patch32"
+                    }
+                },
+                {
+                    "id": "text_prompt",
+                    "type": "t5",
+                    "config": {
+                        "t5_model_name": "t5-base",
+                        "max_length": 128
+                    }
+                },
+                {
+                    "id": "audio_prompt",
+                    "type": "audio_autoencoder_v2",
+                    "config": {
+                        "sample_rate": 44100,
+                        "pretransform_config": {
+                            "type": "autoencoder",
+                            "iterate_batch": true,
+                            "config": {
+                                "encoder": {
+                                    "type": "oobleck",
+                                    "requires_grad": false,
+                                    "config": {
+                                        "in_channels": 2,
+                                        "channels": 128,
+                                        "c_mults": [1, 2, 4, 8, 16],
+                                        "strides": [2, 4, 4, 8, 8],
+                                        "latent_dim": 128,
+                                        "use_snake": true
+                                    }
+                                },
+                                "decoder": {
+                                    "type": "oobleck",
+                                    "config": {
+                                        "out_channels": 2,
+                                        "channels": 128,
+                                        "c_mults": [1, 2, 4, 8, 16],
+                                        "strides": [2, 4, 4, 8, 8],
+                                        "latent_dim": 64,
+                                        "use_snake": true,
+                                        "final_tanh": false
+                                    }
+                                },
+                                "bottleneck": {
+                                    "type": "vae"
+                                },
+                                "latent_dim": 64,
+                                "downsampling_ratio": 2048,
+                                "io_channels": 2
+                            }
+                        },
+                        "pretransform_ckpt_path": "./model/VAE.ckpt",
+                        "latent_seq_len": 215,
+                        "mask_ratio_start": 0,
+                        "mask_ratio_end": 0
+                    }
+                }
+            ],
+            "cond_dim": 768
+        },
+        "diffusion": {
+            "cross_attention_cond_ids": ["video_prompt", "text_prompt", "audio_prompt"],
+            "global_cond_ids": [],
+            "type": "mmdit",
+            "gate": true,
+            "gate_type": "MAF",
+            "gate_type_config": {
+                "num_experts_per_modality": 64,
+                "num_heads": 24,
+                "num_fusion_layers": 8
+            },
+            "config": {
+                "io_channels": 64,
+                "embed_dim": 1536,
+                "depth": 24,
+                "num_heads": 24,
+                "cond_token_dim": 768,
+                "global_cond_dim": 768,
+                "project_cond_tokens": false,
+                "transformer_type": "continuous_transformer",
+                "video_fps": 5
+            }
+        },
+        "io_channels": 64
+    }
+}

model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b20ef15c59f33d7daae3601cee32c61212cf256edc7576c2c5db9390299aa66c
+size 10843616882