defaults:
  - base_pytorch_algo # inherits from configurations/algorithm/base_algo.yaml
  - _self_
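The defaults list is Hydra composition: base_pytorch_algo is merged first, and _self_ places this file's own keys after it, so values here override the inherited ones. A minimal sketch of composing and reading the result (the config_path and config_name are assumptions, not taken from the repo):

```python
from hydra import compose, initialize

# Compose this algorithm config into the full experiment config. The ${...}
# interpolations below (e.g. ${experiment.training.lr}, ${dataset.n_frames})
# resolve against the composed tree, not against this file alone.
with initialize(version_base=None, config_path="configurations"):
    cfg = compose(config_name="config")  # top-level config name is assumed
    print(cfg.algorithm.lr_scheduler.name)  # -> constant_with_warmup
```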
lr: ${experiment.training.lr}
betas: [0.9, 0.95]
weight_decay: 5e-2
lr_scheduler:
  name: constant_with_warmup
  num_warmup_steps: 1000
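For reference, a sketch of how the optimizer and scheduler settings above map onto PyTorch, assuming AdamW (the usual choice for these hyperparameters; the repo's actual wiring may differ):

```python
import torch
from torch.optim.lr_scheduler import LambdaLR

def build_optimizer(model, lr, betas=(0.9, 0.95), weight_decay=5e-2,
                    num_warmup_steps=1000):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr,
                                  betas=betas, weight_decay=weight_decay)
    # constant_with_warmup: ramp linearly from 0 to lr over the warmup
    # steps, then hold lr constant for the rest of training
    warmup = lambda step: min(1.0, (step + 1) / num_warmup_steps)
    scheduler = LambdaLR(optimizer, warmup)
    return optimizer, scheduler
```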
load_video_latent: ${dataset.load_video_latent} # if true, load precomputed latents from disk instead of encoding with the video VAE
load_prompt_embed: ${dataset.load_prompt_embed} # if true, load precomputed prompt embeddings from disk instead of running the language model online
diffusion_forcing:
  enabled: true
  mode: rand_history # one of: independent, rand_history
  clean_hist_prob: 0.5 # probability of conditioning on a clean first frame when fine-tuning image-to-video, overriding diffusion forcing's noise level for that frame
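One plausible reading of these flags at training time, based only on the comments above (the function and tensor names are illustrative, not the repo's): diffusion forcing draws an independent noise level per latent frame, rand_history additionally keeps a random-length prefix of frames clean as history, and clean_hist_prob forces a clean first frame for image-to-video fine-tuning.

```python
import torch

def sample_frame_timesteps(batch, n_frames, num_train_timesteps=1000,
                           mode="rand_history", clean_hist_prob=0.5):
    # core of diffusion forcing: an independent timestep per frame
    t = torch.randint(0, num_train_timesteps, (batch, n_frames))
    if mode == "rand_history":
        # a random-length prefix serves as clean history to condition on
        hist_len = torch.randint(0, n_frames, (batch,))
        for b in range(batch):
            t[b, : hist_len[b]] = 0
        # with probability clean_hist_prob, force a clean first frame
        # (image condition), overriding its diffusion-forcing noise level
        t[torch.rand(batch) < clean_hist_prob, 0] = 0
    return t
```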
n_frames: ${dataset.n_frames}
height: ${dataset.height}
width: ${dataset.width}
num_train_timesteps: 1000
diffusion_type: "continuous" # or "discrete"
sample_solver: unipc
sample_steps: 40
sample_shift: 3.0
lang_guidance: 3.0
neg_prompt: ""
hist_guidance: 2.0 #2.0
sliding_hist: 1 # use 2 latent frames as history when extending videos
gradient_checkpointing_rate: 1.0 # fraction of transformer blocks with gradient checkpointing enabled
max_text_tokens: 512
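The two guidance scales suggest a dual classifier-free guidance rule at sampling time. A common composition for one text scale plus one history scale is the InstructPix2Pix-style form below; whether this repo combines them exactly this way is an assumption:

```python
def dual_cfg(eps_uncond, eps_hist, eps_full,
             lang_guidance=3.0, hist_guidance=2.0):
    # eps_uncond: prediction with neither text nor history;
    # eps_hist: history only; eps_full: history + text prompt
    # (three model evaluations per sampling step).
    return (eps_uncond
            + hist_guidance * (eps_hist - eps_uncond)
            + lang_guidance * (eps_full - eps_hist))
```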
logging:
  loss_freq: 1
  video_freq: 1000
  video_type: grid # grid or single
  fps: ${dataset.fps}
serving:
  port: 6688
text_encoder:
  text_len: 512
  text_dim: 4096
  compile: false
  name: google/umt5-xxl
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
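The text encoder is the google/umt5-xxl encoder (4096-dim hidden states, prompts padded or truncated to 512 tokens). The config loads Wan's own .pth export, so the actual loader differs, but the equivalent Hugging Face path looks like this:

```python
from transformers import AutoTokenizer, UMT5EncoderModel

tok = AutoTokenizer.from_pretrained("google/umt5-xxl")
enc = UMT5EncoderModel.from_pretrained("google/umt5-xxl")

batch = tok("a corgi surfing a wave at sunset", max_length=512,
            padding="max_length", truncation=True, return_tensors="pt")
emb = enc(**batch).last_hidden_state  # [1, 512, 4096] = [1, text_len, text_dim]
```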
vae:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  compile: false
  z_dim: 16
  stride: [4, 8, 8]
  mean: [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921]
  std: [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160]
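The mean/std tables are per-channel statistics for the 16-dim latent space; the usual role of such stats is to standardize latents before the diffusion model sees them (that this repo applies exactly this transform is an assumption). With stride [4, 8, 8], a video of shape [T, H, W] maps to latents of roughly [1 + (T - 1) / 4, H / 8, W / 8].

```python
import torch

# Stats copied from the vae block above, reshaped for [B, C, T, H, W] latents.
MEAN = torch.tensor([-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653,
                     -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632,
                     -0.1922, -0.9497, 0.2503, -0.2921]).view(1, -1, 1, 1, 1)
STD = torch.tensor([2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708,
                    2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579,
                    1.6382, 1.1253, 2.8251, 1.9160]).view(1, -1, 1, 1, 1)

def normalize(z):      # z: [B, 16, T_lat, H/8, W/8], per z_dim and stride
    return (z - MEAN) / STD

def denormalize(z):
    return z * STD + MEAN
```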
model:
  ckpt_path: data/ckpts/Wan2.1-T2V-1.3B
  tuned_ckpt_path: null
  compile: false #true
  model_type: t2v # if i2v, this flag will let the model take in CLIP features
  patch_size: [1, 2, 2]
  in_dim: ${algorithm.vae.z_dim}
  dim: 1536
  ffn_dim: 8960
  freq_dim: 256
  out_dim: ${algorithm.vae.z_dim}
  num_heads: 12
  num_layers: 30
  window_size: [-1, -1]
  qk_norm: true
  cross_attn_norm: true
  eps: 1e-6
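A few sanity checks on the transformer shape: dim / num_heads = 1536 / 12 gives a head dimension of 128, and patch_size [1, 2, 2] turns each latent frame into 2x2 spatial patches. Combined with the VAE stride, the sequence length works out as below (the example resolution is illustrative, not from the config):

```python
def num_tokens(n_frames, height, width, vae_stride=(4, 8, 8), patch=(1, 2, 2)):
    # Wan-style VAE keeps the first frame and compresses the rest 4x in time
    # (an assumption about this specific VAE) and 8x spatially; the DiT then
    # patchifies each latent frame into 2x2 spatial patches.
    t_lat = 1 + (n_frames - 1) // vae_stride[0]
    h_lat, w_lat = height // vae_stride[1], width // vae_stride[2]
    return (t_lat // patch[0]) * (h_lat // patch[1]) * (w_lat // patch[2])

print(num_tokens(81, 480, 832))  # 21 * 30 * 52 = 32760 tokens
```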