debug: false
seed: 39
root_name: audio_head_animator
exp_name: ${root_name}/inference
mode: train
n_epochs: null
cache_dir: cache
ckpt_dir: ${exp_name}/ckpt
resume_ckpt: null

only_resume_state_dict: False
pretrained_ckpt: null

model:
  module_name: model.head_animation.head_animator
  class_name: HeadAnimatorModule
  pretrained_ckpt: ${pretrained_ckpt}
  using_hybrid_mask: True
  output_dir: ${exp_name}

  face_encoder:
    module_name: model.head_animation.LIA_3d.face_encoder
    class_name: FaceEncoder
    image_size: 512
    image_channel: 3
    block_expansion: 64
    num_down_blocks: 3
    max_features: 512
    reshape_channel: 32
    reshape_depth: 16
    num_resblocks: 6

  motion_encoder:
    module_name: model.head_animation.LIA_3d.motion_encoder
    class_name: MotionEncoder
    latent_dim: 512
    size: ${model.face_encoder.image_size}

  flow_estimator:
    module_name: model.head_animation.LIA_3d.flow_estimator
    class_name: FlowEstimator
    latent_dim: ${model.motion_encoder.latent_dim}
    motion_space: 64

  face_generator:
    module_name: model.head_animation.LIA_3d.face_generator
    class_name: FaceGenerator
    size: ${model.face_encoder.image_size}
    latent_dim: ${model.motion_encoder.latent_dim}
    outputsize: ${data.train_width}
    reshape_channel: ${model.face_encoder.reshape_channel}
    group_norm_channel: 32
    flag_estimate_occlusion_map: True

  discriminator:
    module_name: model.head_animation.LIA.discriminator
    class_name: Discriminator
    size: ${data.train_width}

  vgg_loss:
    module_name: model.head_animation.VASA1.loss
    class_name: VGGLoss

# NOTE(review): top-level placement of `loss` and `optimizer` is assumed —
# the source's indentation was lost in extraction; confirm these are not
# expected to be nested under `model` by the config loader.
loss:
  l_w_recon: 1
  l_w_face_l1: 0
  l_w_vgg: 2
  l_w_gan: 0.2
  l_w_face: 0
  l_w_headpose: 0
  l_w_gaze: 0
  l_w_foreground: 0
  l_w_local: 0

optimizer:
  lr: 0.0001
  discriminator_lr: 0.002
  warmup_steps: 0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  weight_decay: 0.0
  g_reg_every: 4
  d_reg_every: 16

logger:
  neptune_project: null
  neptune_api_token: null
  wandb:
    enabled: false
    entity: null
    project: "real-time"

callbacks:
  - module_name: lightning.pytorch.callbacks
    class_name: ModelCheckpoint
    dirpath: ${ckpt_dir}
    every_n_train_steps: 2000
    save_top_k: -1

trainer:
  accelerator: gpu
  log_every_n_steps: 1
  val_check_interval: 100000

data:
  debug: False
  train_bs: 12
  accumulate_grad_batches: 1
  n_sample_frames: 1
  past_n: 1
  num_workers: 8
  ref_sample_margin: 10
  train_width: 512
  train_height: 512
  union_bbox_scale: [1.2, 1.4]
  mouth_bbox_scale: 1.5
  eye_bbox_scale: 2.0
  hybrid_face_mask: ${model.using_hybrid_mask}
  flip_aug: True
  filter_hand_videos: true
  random_sample: False
  dataset_file_path: []
  cache_file_path: []
  train_fps: 25
  dataloader: FastVideoDatasetV2

val_data:
  train_bs: 1
  n_sample_frames: 40
  past_n: 2
  num_workers: 6
  ref_sample_margin: ${data.ref_sample_margin}
  train_width: ${data.train_width}
  train_height: ${data.train_height}
  union_bbox_scale: [1.2, 1.4]
  mouth_bbox_scale: ${data.mouth_bbox_scale}
  eye_bbox_scale: ${data.eye_bbox_scale}
  hybrid_face_mask: ${data.hybrid_face_mask}
  flip_aug: False
  filter_hand_videos: ${data.filter_hand_videos}
  random_sample: False
  dataset_file_path: []
  train_fps: ${data.train_fps}
  dataloader: ${data.dataloader}

test_data:
  height: 384
  width: 672
  image_paths_and_scales: []

inference:
  output_dir: inference_outputs/${exp_name}