Spaces:
Runtime error
Runtime error
{
  "model_type": "mm_diffusion_cond",
  "sample_size": 397312,
  "sample_rate": 44100,
  "audio_channels": 2,
  "model": {
    "pretransform": {
      "type": "autoencoder",
      "iterate_batch": true,
      "config": {
        "encoder": {
          "type": "oobleck",
          "config": {
            "in_channels": 2,
            "channels": 128,
            "c_mults": [1, 2, 4, 8, 16],
            "strides": [2, 4, 4, 8, 8],
            "latent_dim": 128,
            "use_snake": true
          }
        },
        "decoder": {
          "type": "oobleck",
          "config": {
            "out_channels": 2,
            "channels": 128,
            "c_mults": [1, 2, 4, 8, 16],
            "strides": [2, 4, 4, 8, 8],
            "latent_dim": 64,
            "use_snake": true,
            "final_tanh": false
          }
        },
        "bottleneck": {
          "type": "vae"
        },
        "latent_dim": 64,
        "downsampling_ratio": 2048,
        "io_channels": 2
      }
    },
    "conditioning": {
      "configs": [
        {
          "id": "metaclip_features",
          "type": "mm_unchang",
          "config": {
            "dim": 1024,
            "output_dim": 1024
          }
        },
        {
          "id": "metaclip_text_features",
          "type": "mm_unchang",
          "config": {
            "dim": 1024,
            "output_dim": 1024
          }
        },
        {
          "id": "sync_features",
          "type": "mm_unchang",
          "config": {
            "dim": 768,
            "output_dim": 768
          }
        },
        {
          "id": "t5_features",
          "type": "mm_unchang",
          "config": {
            "dim": 2048,
            "output_dim": 2048
          }
        }
      ],
      "cond_dim": 768
    },
    "diffusion": {
      "mm_cond_ids": ["metaclip_features", "sync_features", "metaclip_text_features", "t5_features"],
      "type": "mmdit",
      "diffusion_objective": "rectified_flow",
      "config": {
        "latent_dim": 64,
        "clip_dim": 1024,
        "sync_dim": 768,
        "text_dim": 2048,
        "hidden_dim": 1024,
        "depth": 21,
        "fused_depth": 14,
        "num_heads": 16,
        "latent_seq_len": 194,
        "clip_seq_len": 72,
        "sync_seq_len": 216,
        "v2": true,
        "kernel_size": 3
      }
    },
    "io_channels": 64
  },
  "training": {
    "use_ema": true,
    "log_loss_info": false,
    "cfg_dropout_prob": 0.2,
    "pre_encoded": true,
    "timestep_sampler": "logit_normal",
    "optimizer_configs": {
      "diffusion": {
        "optimizer": {
          "type": "AdamW",
          "config": {
            "lr": 5e-5,
            "betas": [0.9, 0.95],
            "weight_decay": 1e-4,
            "eps": 1e-6
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 1000000,
            "power": 0.5,
            "warmup": 0.99
          }
        }
      }
    },
    "demo": {
      "demo_every": 5000,
      "demo_steps": 24,
      "num_demos": 10,
      "demo_cond": [
        "dataset/vggsound/video_latents_t5_clip_npz/test/0Cu33yBwAPg_000060.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/bmKtI808DsU_000009.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/VC0c22cJTbM_000424.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/F3gsbUTdc2U_000090.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/WatvT8A8iug_000100.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/0nvBTp-q7tU_000112.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/3-PFuDkTM48_000080.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/luSAuu-BoPs_000232.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/__8UJxW0aOQ_000002.npz",
        "dataset/vggsound/video_latents_t5_clip_npz/test/_0m_YMpQayA_000168.npz"
      ],
      "demo_cfg_scales": [5]
    }
  }
}