# Hugging Face Spaces page header (scrape residue): Running on Zero
# Algorithm config for diffusion-forcing video generation on Wan2.1-T2V-1.3B.
# NOTE(review): original indentation was destroyed by table-pipe wrapping;
# nesting below is reconstructed from key semantics and standard Hydra layout —
# confirm against the consuming code (esp. n_frames/height/width being top-level).
defaults:
  - base_pytorch_algo # inherits from configurations/algorithm/base_algo.yaml
  - _self_

# Optimizer (AdamW-style) hyperparameters.
lr: ${experiment.training.lr}
betas: [0.9, 0.95]
weight_decay: 5e-2

lr_scheduler:
  name: constant_with_warmup
  num_warmup_steps: 1000

load_video_latent: ${dataset.load_video_latent} # if true, load latent from disk instead of using video vae
load_prompt_embed: ${dataset.load_prompt_embed} # if true, load prompt embedding from disk instead of running language model online

diffusion_forcing:
  enabled: true
  mode: rand_history # independent, rand_history
  clean_hist_prob: 0.5 # probability of giving first frame image condition when finetuning image-to-video, overriding diffusion forcing's noise level for first frame

# Spatio-temporal dimensions, mirrored from the dataset config.
n_frames: ${dataset.n_frames}
height: ${dataset.height}
width: ${dataset.width}

# Diffusion training / sampling settings.
num_train_timesteps: 1000
diffusion_type: "continuous" # or "discrete"
sample_solver: unipc
sample_steps: 40
sample_shift: 3.0
lang_guidance: 3.0 # classifier-free guidance scale on the text prompt
neg_prompt: ""
hist_guidance: 2.0 # guidance scale on the history (past-frame) condition
sliding_hist: 1 # use 2 latent frames as history when extending videos
gradient_checkpointing_rate: 1.0 # gradient checkpointing blocks as a ratio of total blocks
max_text_tokens: 512

logging:
  loss_freq: 1
  video_freq: 1000
  video_type: grid # grid or single
  fps: ${dataset.fps}

serving:
  port: 6688

text_encoder:
  text_len: 512
  text_dim: 4096
  compile: false
  name: google/umt5-xxl
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth

vae:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  compile: false
  z_dim: 16
  stride: [4, 8, 8] # temporal, height, width downsampling factors
  # Per-channel latent normalization statistics.
  mean: [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921]
  std: [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160]

model:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B
  tuned_ckpt_path: null
  compile: false
  model_type: t2v # if i2v, this flag will let the model take in CLIP features
  patch_size: [1, 2, 2]
  in_dim: ${algorithm.vae.z_dim}
  dim: 1536
  ffn_dim: 8960
  freq_dim: 256
  out_dim: ${algorithm.vae.z_dim}
  num_heads: 12
  num_layers: 30
  window_size: [-1, -1]
  qk_norm: true
  cross_attn_norm: true
  eps: 1e-6