zhangj1an
/

audiox_random

Model card Files Files and versions

audiox_random / config.json

zhangj1an's picture

Upload folder using huggingface_hub

b79aa87 verified 17 days ago

history blame contribute delete

4.61 kB

	{
	"model_type": "diffusion_cond",
	"sample_size": 483328,
	"sample_rate": 44100,
	"video_fps": 5,
	"audio_channels": 2,
	"model": {
	"pretransform": {
	"type": "autoencoder",
	"iterate_batch": true,
	"config": {
	"encoder": {
	"type": "oobleck",
	"requires_grad": false,
	"config": {
	"in_channels": 2,
	"channels": 128,
	"c_mults": [
	1,
	2,
	4,
	8,
	16
	],
	"strides": [
	2,
	4,
	4,
	8,
	8
	],
	"latent_dim": 128,
	"use_snake": true
	}
	},
	"decoder": {
	"type": "oobleck",
	"config": {
	"out_channels": 2,
	"channels": 128,
	"c_mults": [
	1,
	2,
	4,
	8,
	16
	],
	"strides": [
	2,
	4,
	4,
	8,
	8
	],
	"latent_dim": 64,
	"use_snake": true,
	"final_tanh": false
	}
	},
	"bottleneck": {
	"type": "vae"
	},
	"latent_dim": 64,
	"downsampling_ratio": 2048,
	"io_channels": 2
	}
	},
	"conditioning": {
	"configs": [
	{
	"id": "video_prompt",
	"type": "clip-with-sync-w-empty-feat",
	"config": {
	"clip_model_name": "openai/clip-vit-base-patch32"
	}
	},
	{
	"id": "text_prompt",
	"type": "t5",
	"config": {
	"t5_model_name": "t5-base",
	"max_length": 128
	}
	},
	{
	"id": "audio_prompt",
	"type": "audio_autoencoder_v2",
	"config": {
	"sample_rate": 44100,
	"pretransform_config": {
	"type": "autoencoder",
	"iterate_batch": true,
	"config": {
	"encoder": {
	"type": "oobleck",
	"requires_grad": false,
	"config": {
	"in_channels": 2,
	"channels": 128,
	"c_mults": [
	1,
	2,
	4,
	8,
	16
	],
	"strides": [
	2,
	4,
	4,
	8,
	8
	],
	"latent_dim": 128,
	"use_snake": true
	}
	},
	"decoder": {
	"type": "oobleck",
	"config": {
	"out_channels": 2,
	"channels": 128,
	"c_mults": [
	1,
	2,
	4,
	8,
	16
	],
	"strides": [
	2,
	4,
	4,
	8,
	8
	],
	"latent_dim": 64,
	"use_snake": true,
	"final_tanh": false
	}
	},
	"bottleneck": {
	"type": "vae"
	},
	"latent_dim": 64,
	"downsampling_ratio": 2048,
	"io_channels": 2
	}
	},
	"pretransform_ckpt_path": "./model/VAE.ckpt",
	"latent_seq_len": 50,
	"mask_ratio_start": 0,
	"mask_ratio_end": 0
	}
	}
	],
	"cond_dim": 768
	},
	"diffusion": {
	"cross_attention_cond_ids": [
	"video_prompt",
	"text_prompt",
	"audio_prompt"
	],
	"global_cond_ids": [],
	"type": "mmdit",
	"gate": true,
	"gate_type": "MAF",
	"gate_type_config": {
	"num_experts_per_modality": 16,
	"num_heads": 6,
	"num_fusion_layers": 2
	},
	"config": {
	"io_channels": 64,
	"embed_dim": 384,
	"depth": 4,
	"num_heads": 6,
	"cond_token_dim": 768,
	"global_cond_dim": 768,
	"project_cond_tokens": false,
	"transformer_type": "continuous_transformer",
	"video_fps": 5
	}
	},
	"io_channels": 64
	}
	}