defaults:
  - base_pytorch_algo # inherits from configurations/algorithm/base_algo.yaml
  - _self_

lr: ${experiment.training.lr}
betas: [0.9, 0.95]
weight_decay: 0.05
lr_scheduler:
  name: constant_with_warmup
  num_warmup_steps: 1000

load_video_latent: ${dataset.load_video_latent} # if true, load latent from disk instead of using video vae
load_prompt_embed: ${dataset.load_prompt_embed} # if true, load prompt embedding from disk instead of running language model online

diffusion_forcing:
  enabled: true
  mode: rand_history # independent, rand_history
  clean_hist_prob: 0.5 # probability of giving first frame image condition when finetuning image-to-video, overriding diffusion forcing's noise level for first frame

n_frames: ${dataset.n_frames}
height: ${dataset.height}
width: ${dataset.width}
num_train_timesteps: 1000
diffusion_type: "continuous" # or "discrete"
sample_solver: unipc
sample_steps: 40
sample_shift: 3.0
lang_guidance: 3.0
neg_prompt: ""
hist_guidance: 2.0
sliding_hist: 1 # latent frames used as history when extending videos — NOTE(review): prior comment said "2 latent frames" but the value is 1; confirm intended count
gradient_checkpointing_rate: 1.0  # gradient checkpointing blocks as a ratio of total blocks
max_text_tokens: 512

logging:
  loss_freq: 1
  video_freq: 1000
  video_type: grid # grid or single
  fps: ${dataset.fps}

serving:
  port: 6688

text_encoder:
  text_len: 512
  text_dim: 4096
  compile: false
  name: google/umt5-xxl
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth

vae:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  compile: false
  z_dim: 16
  stride: [4, 8, 8]
  mean: [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921]
  std: [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160]
  
model:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B
  tuned_ckpt_path: null
  compile: false
  model_type: t2v # if i2v, this flag will let the model take in CLIP features
  patch_size: [1, 2, 2]
  in_dim: ${algorithm.vae.z_dim}
  dim: 1536
  ffn_dim: 8960
  freq_dim: 256
  out_dim: ${algorithm.vae.z_dim}
  num_heads: 12
  num_layers: 30
  window_size: [-1, -1]
  qk_norm: true
  cross_attn_norm: true
  eps: 1.0e-6