# PoseModifier / UniAnimate /configs /UniAnimate_infer_long.yaml
# Evgeny Zhukov
# Origin: https://github.com/ali-vilab/UniAnimate/commit/d7814fa44a0a1154524b92fce0e3133a2604d333
# 2ba4412
# ---- Manual settings (values expected to be edited per run) ----

# Active resolution. The original notes list [512, 768] as the alternative.
# NOTE(review): axis order (width/height) is not stated here; confirm it
# against the consuming code.
resolution: [768, 1216]
round: 1
# Original author's note: choose a value among 25-50.
ddim_timesteps: 30
context_size: 32
context_stride: 1
context_overlap: 8
seed: 7
# Example values: 64, 96. Per the original note, 'None' means the length of
# the original pose sequence. This is the string 'None', not a YAML null.
max_frames: 'None'
# Inference cases, one entry per case.
# Entry format: [frame_interval, reference image, driving pose sequence]
# Written as a block sequence so that no flow-collection continuation line
# (or closing bracket) sits at column 0, which strict YAML parsers reject.
test_list_path:
  - [2, "data/images/WOMEN-Blouses_Shirts-id_00004955-01_4_full.jpg", "data/saved_pose/WOMEN-Blouses_Shirts-id_00004955-01_4_full"]
  - [2, "data/images/musk.jpg", "data/saved_pose/musk"]
  - [2, "data/images/WOMEN-Blouses_Shirts-id_00005125-03_4_full.jpg", "data/saved_pose/WOMEN-Blouses_Shirts-id_00005125-03_4_full"]
  - [2, "data/images/IMG_20240514_104337.jpg", "data/saved_pose/IMG_20240514_104337"]
  - [2, "data/images/IMG_20240514_104337.jpg", "data/saved_pose/IMG_20240514_104337_dance"]
  - [2, "data/images/WOMEN-Blouses_Shirts-id_00005125-03_4_full.jpg", "data/saved_pose/WOMEN-Blouses_Shirts-id_00005125-03_4_full_dance"]
# ---- Default settings ----

TASK_TYPE: inference_unianimate_long_entrance
guide_scale: 2.5
vit_resolution: [224, 224]
use_fp16: true
batch_size: 1
latent_random_ref: true
chunk_size: 2
decoder_bs: 2
scale: 8
use_fps_condition: false
# Relative checkpoint path.
# NOTE(review): presumably resolved from the working directory; confirm.
test_model: checkpoints/unianimate_16f_32f_non_ema_223000.pth
# A list of key groups; each group is a list of strings. One group is
# configured here. Block style keeps every continuation line indented, as
# the YAML spec requires for collections nested in a block mapping.
partial_keys:
  - ['image', 'randomref', 'dwpose']
# Embedder settings: implementation type name, layer selector, and the
# path to its pretrained weights. Block style replaces the original flow
# mapping, whose continuation lines at column 0 are rejected by strict
# YAML parsers.
embedder:
  type: 'FrozenOpenCLIPTextVisualEmbedder'
  layer: 'penultimate'
  pretrained: 'checkpoints/open_clip_pytorch_model.bin'
# Autoencoder settings: implementation type name, its nested `ddconfig`
# hyper-parameters, the embedding dimension, and the pretrained checkpoint
# path. Block style replaces the original nested flow mapping, whose
# continuation lines at column 0 are rejected by strict YAML parsers.
auto_encoder:
  type: 'AutoencoderKL'
  ddconfig:
    double_z: true
    z_channels: 4
    resolution: 256
    in_channels: 3
    out_ch: 3
    ch: 128
    ch_mult: [1, 2, 4, 4]
    num_res_blocks: 2
    # Explicit empty list (not null).
    attn_resolutions: []
    dropout: 0.0
    video_kernel_size: [3, 1, 1]
  embed_dim: 4
  pretrained: 'checkpoints/v2-1_512-ema-pruned.ckpt'
# UNet settings: implementation type name plus its constructor-style
# hyper-parameters. Block style replaces the original flow mapping, whose
# continuation lines at column 0 are rejected by strict YAML parsers.
UNet:
  type: 'UNetSD_UniAnimate'
  # The original wrote a bare None, which YAML loads as the string "None"
  # (YAML's null is spelled null or ~). It is quoted here so the loaded
  # value is unchanged while the type is explicit.
  # NOTE(review): confirm whether the consumer actually expects null.
  config: 'None'
  in_dim: 4
  dim: 320
  y_dim: 1024
  context_dim: 1024
  out_dim: 4
  dim_mult: [1, 2, 4, 4]
  num_heads: 8
  head_dim: 64
  num_res_blocks: 2
  dropout: 0.1
  temporal_attention: true
  num_tokens: 4
  temporal_attn_times: 1
  use_checkpoint: true
  # Same value as the top-level use_fps_condition key in this file.
  use_fps_condition: false
  use_sim_mask: false
# Names of the video composition inputs, in the original order.
video_compositions:
  - image
  - local_image
  - dwpose
  - randomref
  - randomref_pose
# Diffusion settings: implementation type name, the noise schedule and its
# parameters, and the prediction/loss/variance options. Block style
# replaces the original flow mapping (which also carried a trailing comma),
# whose continuation lines at column 0 are rejected by strict YAML parsers.
Diffusion:
  type: 'DiffusionDDIMLong'
  schedule: 'linear_sd'
  schedule_param:
    num_timesteps: 1000
    init_beta: 0.00085
    last_beta: 0.0120
    zero_terminal_snr: true
  mean_type: 'v'
  loss_type: 'mse'
  var_type: 'fixed_small'
  rescale_timesteps: false
  noise_strength: 0.1
CPU_CLIP_VAE: true
context_batch_size: 1
# Alternatives noted by the original author: 999, 949.
noise_prior_value: 939