# This config file is mainly used by img_to_mask.py to obtain the
# face-detection-related parameters.
# ${...} values are OmegaConf-style interpolations resolved at load time.
debug: false
seed: 39
root_name: audio_head_animator
exp_name: ${root_name}/inference
mode: train
n_epochs: null
cache_dir: cache
ckpt_dir: ${exp_name}/ckpt
resume_ckpt: null

# NOTE(review): presumably restores only model weights (no optimizer or
# trainer state) when resuming — confirm in the checkpoint-loading code.
only_resume_state_dict: false
pretrained_ckpt: null

model:
  # Top-level head-animation module, assembled from the sub-modules below.
  # Each sub-module is instantiated from module_name/class_name.
  module_name: model.head_animation.head_animator
  class_name: HeadAnimatorModule
  pretrained_ckpt: ${pretrained_ckpt}
  using_hybrid_mask: true
  output_dir: ${exp_name}

  face_encoder:
    module_name: model.head_animation.LIA_3d.face_encoder
    class_name: FaceEncoder
    image_size: 512
    image_channel: 3
    block_expansion: 64
    num_down_blocks: 3
    max_features: 512
    reshape_channel: 32
    reshape_depth: 16
    num_resblocks: 6

  motion_encoder:
    module_name: model.head_animation.LIA_3d.motion_encoder
    class_name: MotionEncoder
    latent_dim: 512
    # Keep the motion encoder input size in sync with the face encoder.
    size: ${model.face_encoder.image_size}

  flow_estimator:
    module_name: model.head_animation.LIA_3d.flow_estimator
    class_name: FlowEstimator
    latent_dim: ${model.motion_encoder.latent_dim}
    motion_space: 64

  face_generator:
    module_name: model.head_animation.LIA_3d.face_generator
    class_name: FaceGenerator
    size: ${model.face_encoder.image_size}
    latent_dim: ${model.motion_encoder.latent_dim}
    outputsize: ${data.train_width}
    reshape_channel: ${model.face_encoder.reshape_channel}
    group_norm_channel: 32
    flag_estimate_occlusion_map: true

  discriminator:
    module_name: model.head_animation.LIA.discriminator
    class_name: Discriminator
    size: ${data.train_width}

  vgg_loss:
    module_name: model.head_animation.VASA1.loss
    class_name: VGGLoss

loss:
  # Scalar weights for the individual training loss terms.
  # NOTE(review): several weights are 0 — presumably those terms are
  # disabled; confirm against the training loop.
  l_w_recon: 1
  l_w_face_l1: 0
  l_w_vgg: 2
  l_w_gan: 0.2
  l_w_face: 0
  l_w_headpose: 0
  l_w_gaze: 0
  l_w_foreground: 0
  l_w_local: 0

optimizer:
  # Adam hyper-parameters; the discriminator has its own learning rate.
  lr: 0.0001
  discriminator_lr: 0.002
  warmup_steps: 0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  weight_decay: 0.0
  # NOTE(review): g_reg_every / d_reg_every look like lazy-regularization
  # intervals (apply G/D regularization every N steps) — confirm in trainer.
  g_reg_every: 4
  d_reg_every: 16

logger:
  # Experiment-tracking backends; all disabled/unset by default.
  neptune_project: null
  neptune_api_token: null
  wandb:
    enabled: false
    entity: null
    project: "real-time"

callbacks:
  # Lightning callbacks, instantiated from module_name/class_name.
  # Checkpoints every 2000 training steps; save_top_k: -1 keeps every
  # checkpoint (Lightning ModelCheckpoint semantics).
  - module_name: lightning.pytorch.callbacks
    class_name: ModelCheckpoint
    dirpath: ${ckpt_dir}
    every_n_train_steps: 2000
    save_top_k: -1

trainer:
  # Arguments forwarded to the Lightning Trainer.
  accelerator: gpu
  log_every_n_steps: 1
  val_check_interval: 100000

data:
  # Training-dataset / dataloader settings. val_data mirrors these keys,
  # mostly via interpolation.
  debug: false
  train_bs: 12
  accumulate_grad_batches: 1
  n_sample_frames: 1
  past_n: 1
  num_workers: 8
  ref_sample_margin: 10
  train_width: 512
  train_height: 512
  # NOTE(review): presumably [min, max] scale range for the union face
  # bbox crop — confirm in the dataset code.
  union_bbox_scale: [1.2, 1.4]
  mouth_bbox_scale: 1.5
  eye_bbox_scale: 2.0
  hybrid_face_mask: ${model.using_hybrid_mask}
  flip_aug: true
  filter_hand_videos: true
  random_sample: false
  dataset_file_path: []
  cache_file_path: []
  train_fps: 25
  dataloader: FastVideoDatasetV2

val_data:
  # Validation-dataset settings; shared values are interpolated from
  # the data stanza above, augmentation is disabled.
  train_bs: 1
  n_sample_frames: 40
  past_n: 2
  num_workers: 6
  ref_sample_margin: ${data.ref_sample_margin}
  train_width: ${data.train_width}
  train_height: ${data.train_height}
  union_bbox_scale: ${data.union_bbox_scale}
  mouth_bbox_scale: ${data.mouth_bbox_scale}
  eye_bbox_scale: ${data.eye_bbox_scale}
  hybrid_face_mask: ${data.hybrid_face_mask}
  flip_aug: false
  filter_hand_videos: ${data.filter_hand_videos}
  random_sample: false
  dataset_file_path: []
  train_fps: ${data.train_fps}
  dataloader: ${data.dataloader}

test_data:
  # NOTE(review): the 672x384 test resolution differs from the 512x512
  # train/val resolution — confirm this is intentional.
  height: 384
  width: 672
  image_paths_and_scales: []

inference:
  # Directory where inference outputs are written.
  output_dir: inference_outputs/${exp_name}