defaults:
  - base_pytorch_algo # inherits from configurations/algorithm/base_algo.yaml
  - _self_

lr: ${experiment.training.lr}
betas: [0.9, 0.95]
# NOTE: written as 5.0e-2 (not 5e-2) so YAML 1.1 parsers (PyYAML) resolve a float, not a string
weight_decay: 5.0e-2

lr_scheduler:
  name: constant_with_warmup
  num_warmup_steps: 1000

load_video_latent: ${dataset.load_video_latent} # if true, load latent from disk instead of using video vae
load_prompt_embed: ${dataset.load_prompt_embed} # if true, load prompt embedding from disk instead of running language model online

diffusion_forcing:
  enabled: true
  mode: rand_history # independent, rand_history
  clean_hist_prob: 0.5 # probability of giving first frame image condition when finetuning image-to-video, overriding diffusion forcing's noise level for first frame

n_frames: ${dataset.n_frames}
height: ${dataset.height}
width: ${dataset.width}
num_train_timesteps: 1000
diffusion_type: "continuous" # or "discrete"
sample_solver: unipc
sample_steps: 40
sample_shift: 3.0
lang_guidance: 3.0
neg_prompt: ""
hist_guidance: 2.0  # 2.0
sliding_hist: 1 # use 2 latent frames as history when extending videos
gradient_checkpointing_rate: 1.0 # gradient checkpointing blocks as a ratio of total blocks
max_text_tokens: 512

logging:
  loss_freq: 1
  video_freq: 1000
  video_type: grid # grid or single
  fps: ${dataset.fps}

serving:
  port: 6688

text_encoder:
  text_len: 512
  text_dim: 4096
  compile: false
  name: google/umt5-xxl
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth

vae:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  compile: false
  z_dim: 16
  stride: [4, 8, 8] # temporal, height, width compression factors
  mean: [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921]
  std: [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160]

model:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B
  tuned_ckpt_path: null
  compile: false  # true
  model_type: t2v # if i2v, this flag will let the model take in CLIP features
  patch_size: [1, 2, 2]
  in_dim: ${algorithm.vae.z_dim}
  dim: 1536
  ffn_dim: 8960
  freq_dim: 256
  out_dim: ${algorithm.vae.z_dim}
  num_heads: 12
  num_layers: 30
  window_size: [-1, -1]
  qk_norm: true
  cross_attn_norm: true
  # NOTE: written as 1.0e-6 (not 1e-6) so YAML 1.1 parsers (PyYAML) resolve a float, not a string
  eps: 1.0e-6