# NOTE(review): the three lines below are web-scrape residue from a GitHub
# commit header, preserved as comments so the file parses as YAML.
# kiwhansong's picture
# add demo
# 142a1ac
# Hydra defaults list: compose the shared base config first, then apply this
# file's own keys (`_self_` last means values below override the base).
defaults:
  - base_pytorch_algo # inherits from configurations/algorithm/base_algo.yaml
  - _self_
# Optimizer hyperparameters (values resolved via OmegaConf interpolation).
lr: ${experiment.training.lr}
betas: [0.9, 0.95]
# Written as a plain decimal: bare `5e-2` (no dot before `e`) is resolved as a
# *string*, not a float, by YAML 1.1 loaders such as plain PyYAML.
weight_decay: 0.05
lr_scheduler:
  name: constant_with_warmup
  num_warmup_steps: 1000
load_video_latent: ${dataset.load_video_latent} # if true, load latent from disk instead of using video vae
load_prompt_embed: ${dataset.load_prompt_embed} # if true, load prompt embedding from disk instead of running language model online
# Diffusion-forcing training scheme (per-frame independent noise levels).
diffusion_forcing:
  enabled: true
  mode: rand_history # independent, rand_history
  # probability of giving first frame image condition when finetuning
  # image-to-video, overriding diffusion forcing's noise level for first frame
  clean_hist_prob: 0.5
# Data dimensions, pulled from the dataset config via interpolation.
n_frames: ${dataset.n_frames}
height: ${dataset.height}
width: ${dataset.width}
# Diffusion training / sampling settings.
num_train_timesteps: 1000
diffusion_type: "continuous" # or "discrete"
sample_solver: unipc
sample_steps: 40
sample_shift: 3.0
lang_guidance: 3.0 # guidance scale for the text prompt
neg_prompt: ""
hist_guidance: 2.0 # guidance scale for history conditioning
sliding_hist: 1 # latent frames used as history when extending videos; original comment said "2 latent frames" but the value is 1 — TODO confirm intended value
gradient_checkpointing_rate: 1.0 # gradient checkpointing blocks as a ratio of total blocks
max_text_tokens: 512
# Training-time logging cadence and video preview format.
logging:
  loss_freq: 1
  video_freq: 1000
  video_type: grid # grid or single
  fps: ${dataset.fps}
# Demo-serving endpoint.
serving:
  port: 6688
# UMT5-XXL text encoder used to embed prompts.
text_encoder:
  text_len: 512
  text_dim: 4096
  compile: false
  name: google/umt5-xxl
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
# Wan2.1 video VAE. `mean`/`std` are per-channel latent statistics
# (z_dim = 16 entries each) used to normalize latents.
vae:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  compile: false
  z_dim: 16
  stride: [4, 8, 8] # presumably (temporal, height, width) downsampling factors — TODO confirm against VAE implementation
  mean: [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921]
  std: [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160]
# Diffusion-transformer backbone architecture and checkpoint paths.
model:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B
  tuned_ckpt_path: null # optional fine-tuned checkpoint path (null = none) — TODO confirm load semantics
  compile: false #true
  model_type: t2v # if i2v, this flag will let the model take in CLIP features
  patch_size: [1, 2, 2]
  in_dim: ${algorithm.vae.z_dim}
  dim: 1536
  ffn_dim: 8960
  freq_dim: 256
  out_dim: ${algorithm.vae.z_dim}
  num_heads: 12
  num_layers: 30
  window_size: [-1, -1]
  # canonical lowercase booleans (were `True`; same parsed value, lint-clean)
  qk_norm: true
  cross_attn_norm: true
  eps: 1.0e-6 # dotted mantissa so YAML 1.1 loaders parse a float, not a string