# This config file is mainly used by img_to_mask.py to obtain the
# face-detection-related parameters.
# NOTE(review): this file had been collapsed onto a single commented-out line,
# which made the whole configuration inert. The block structure below is
# reconstructed from the OmegaConf interpolation paths (e.g.
# ${model.face_encoder.image_size}, ${data.train_width}); the top-level
# placement of loss/optimizer/logger/callbacks/trainer follows the usual
# Lightning config layout — confirm against the consuming code.
---
# --- run / experiment identity ---
debug: false
seed: 39
root_name: audio_head_animator
exp_name: ${root_name}/inference
mode: train
n_epochs: null
cache_dir: cache
ckpt_dir: ${exp_name}/ckpt

# --- checkpoint resume options ---
resume_ckpt: null
only_resume_state_dict: false
pretrained_ckpt: null

# --- model composition (module_name/class_name pairs are dynamically imported) ---
model:
  module_name: model.head_animation.head_animator
  class_name: HeadAnimatorModule
  pretrained_ckpt: ${pretrained_ckpt}
  using_hybrid_mask: true
  output_dir: ${exp_name}

  face_encoder:
    module_name: model.head_animation.LIA_3d.face_encoder
    class_name: FaceEncoder
    image_size: 512
    image_channel: 3
    block_expansion: 64
    num_down_blocks: 3
    max_features: 512
    reshape_channel: 32
    reshape_depth: 16
    num_resblocks: 6

  motion_encoder:
    module_name: model.head_animation.LIA_3d.motion_encoder
    class_name: MotionEncoder
    latent_dim: 512
    size: ${model.face_encoder.image_size}

  flow_estimator:
    module_name: model.head_animation.LIA_3d.flow_estimator
    class_name: FlowEstimator
    latent_dim: ${model.motion_encoder.latent_dim}
    motion_space: 64

  face_generator:
    module_name: model.head_animation.LIA_3d.face_generator
    class_name: FaceGenerator
    size: ${model.face_encoder.image_size}
    latent_dim: ${model.motion_encoder.latent_dim}
    outputsize: ${data.train_width}
    reshape_channel: ${model.face_encoder.reshape_channel}
    group_norm_channel: 32
    flag_estimate_occlusion_map: true

  discriminator:
    module_name: model.head_animation.LIA.discriminator
    class_name: Discriminator
    size: ${data.train_width}

  vgg_loss:
    module_name: model.head_animation.VASA1.loss
    class_name: VGGLoss

# --- loss weights (l_w_* = weight per loss term; 0 disables the term) ---
loss:
  l_w_recon: 1
  l_w_face_l1: 0
  l_w_vgg: 2
  l_w_gan: 0.2
  l_w_face: 0
  l_w_headpose: 0
  l_w_gaze: 0
  l_w_foreground: 0
  l_w_local: 0

# --- optimizer (Adam) + GAN regularization cadence ---
optimizer:
  lr: 0.0001
  discriminator_lr: 0.002
  warmup_steps: 0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  weight_decay: 0.0
  g_reg_every: 4
  d_reg_every: 16

# --- experiment tracking (all disabled by default) ---
logger:
  neptune_project: null
  neptune_api_token: null
  wandb:
    enabled: false
    entity: null
    project: "real-time"

# --- Lightning callbacks (constructed from module_name/class_name) ---
callbacks:
  - module_name: lightning.pytorch.callbacks
    class_name: ModelCheckpoint
    dirpath: ${ckpt_dir}
    every_n_train_steps: 2000
    save_top_k: -1  # -1 keeps every checkpoint

trainer:
  accelerator: gpu
  log_every_n_steps: 1
  val_check_interval: 100000

# --- training data pipeline ---
data:
  debug: false
  train_bs: 12
  accumulate_grad_batches: 1
  n_sample_frames: 1
  past_n: 1
  num_workers: 8
  ref_sample_margin: 10
  train_width: 512
  train_height: 512
  union_bbox_scale: [1.2, 1.4]
  mouth_bbox_scale: 1.5
  eye_bbox_scale: 2.0
  hybrid_face_mask: ${model.using_hybrid_mask}
  flip_aug: true
  filter_hand_videos: true
  random_sample: false
  dataset_file_path: []
  cache_file_path: []
  train_fps: 25
  dataloader: FastVideoDatasetV2

# --- validation data (inherits most settings from `data` via interpolation) ---
val_data:
  train_bs: 1
  n_sample_frames: 40
  past_n: 2
  num_workers: 6
  ref_sample_margin: ${data.ref_sample_margin}
  train_width: ${data.train_width}
  train_height: ${data.train_height}
  union_bbox_scale: [1.2, 1.4]
  mouth_bbox_scale: ${data.mouth_bbox_scale}
  eye_bbox_scale: ${data.eye_bbox_scale}
  hybrid_face_mask: ${data.hybrid_face_mask}
  flip_aug: false
  filter_hand_videos: ${data.filter_hand_videos}
  random_sample: false
  dataset_file_path: []
  train_fps: ${data.train_fps}
  dataloader: ${data.dataloader}

test_data:
  height: 384
  width: 672
  image_paths_and_scales: []

inference:
  output_dir: inference_outputs/${exp_name}