Commit
·
872b1a7
1
Parent(s):
1631026
upload ckpt
Browse files- checkpoints/last.ckpt +3 -0
- tools/pretrained_model/epoch=0-step=312000.ckpt +3 -0
- tools/visualization_0416/configs/audio_head_animator.yaml +154 -0
- tools/visualization_0416/configs/head_animator_best_0506.yaml +153 -0
- tools/visualization_0416/img_to_latent.py +71 -0
- tools/visualization_0416/img_to_mask.py +199 -0
- tools/visualization_0416/latent_to_video.py +156 -0
- tools/visualization_0416/latent_to_video_batch.py +249 -0
- tools/visualization_0416/utils/__init__.py +0 -0
- tools/visualization_0416/utils/face_detector.py +624 -0
- tools/visualization_0416/utils/face_landmarker.task +3 -0
checkpoints/last.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15baf05196834ca54b4da7c2c9fd372b572e650b1edd84cb5a92c4be1689f29b
|
| 3 |
+
size 7730126719
|
tools/pretrained_model/epoch=0-step=312000.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9f06f55912dfa12ea18d77315f1c1675c5f67669097db8f155b4a4d75f9d31d
|
| 3 |
+
size 1579672001
|
tools/visualization_0416/configs/audio_head_animator.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 此配置文件主要用于 img_to_mask.py 获取 face detection 相关参数
|
| 2 |
+
debug: false
|
| 3 |
+
seed: 39
|
| 4 |
+
root_name: audio_head_animator
|
| 5 |
+
exp_name: ${root_name}/inference
|
| 6 |
+
mode: train
|
| 7 |
+
n_epochs: null
|
| 8 |
+
cache_dir: cache
|
| 9 |
+
ckpt_dir: ${exp_name}/ckpt
|
| 10 |
+
resume_ckpt: null
|
| 11 |
+
|
| 12 |
+
only_resume_state_dict: False
|
| 13 |
+
pretrained_ckpt: null
|
| 14 |
+
|
| 15 |
+
model:
|
| 16 |
+
module_name: model.head_animation.head_animator
|
| 17 |
+
class_name: HeadAnimatorModule
|
| 18 |
+
pretrained_ckpt: ${pretrained_ckpt}
|
| 19 |
+
using_hybrid_mask: True
|
| 20 |
+
output_dir: ${exp_name}
|
| 21 |
+
|
| 22 |
+
face_encoder:
|
| 23 |
+
module_name: model.head_animation.LIA_3d.face_encoder
|
| 24 |
+
class_name: FaceEncoder
|
| 25 |
+
image_size: 512
|
| 26 |
+
image_channel: 3
|
| 27 |
+
block_expansion: 64
|
| 28 |
+
num_down_blocks: 3
|
| 29 |
+
max_features: 512
|
| 30 |
+
reshape_channel: 32
|
| 31 |
+
reshape_depth: 16
|
| 32 |
+
num_resblocks: 6
|
| 33 |
+
|
| 34 |
+
motion_encoder:
|
| 35 |
+
module_name: model.head_animation.LIA_3d.motion_encoder
|
| 36 |
+
class_name: MotionEncoder
|
| 37 |
+
latent_dim: 512
|
| 38 |
+
size: ${model.face_encoder.image_size}
|
| 39 |
+
|
| 40 |
+
flow_estimator:
|
| 41 |
+
module_name: model.head_animation.LIA_3d.flow_estimator
|
| 42 |
+
class_name: FlowEstimator
|
| 43 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 44 |
+
motion_space: 64
|
| 45 |
+
|
| 46 |
+
face_generator:
|
| 47 |
+
module_name: model.head_animation.LIA_3d.face_generator
|
| 48 |
+
class_name: FaceGenerator
|
| 49 |
+
size: ${model.face_encoder.image_size}
|
| 50 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 51 |
+
outputsize: ${data.train_width}
|
| 52 |
+
reshape_channel: ${model.face_encoder.reshape_channel}
|
| 53 |
+
group_norm_channel: 32
|
| 54 |
+
flag_estimate_occlusion_map: True
|
| 55 |
+
|
| 56 |
+
discriminator:
|
| 57 |
+
module_name: model.head_animation.LIA.discriminator
|
| 58 |
+
class_name: Discriminator
|
| 59 |
+
size: ${data.train_width}
|
| 60 |
+
|
| 61 |
+
vgg_loss:
|
| 62 |
+
module_name: model.head_animation.VASA1.loss
|
| 63 |
+
class_name: VGGLoss
|
| 64 |
+
|
| 65 |
+
loss:
|
| 66 |
+
l_w_recon: 1
|
| 67 |
+
l_w_face_l1: 0
|
| 68 |
+
l_w_vgg: 2
|
| 69 |
+
l_w_gan: 0.2
|
| 70 |
+
l_w_face: 0
|
| 71 |
+
l_w_headpose: 0
|
| 72 |
+
l_w_gaze: 0
|
| 73 |
+
l_w_foreground: 0
|
| 74 |
+
l_w_local: 0
|
| 75 |
+
|
| 76 |
+
optimizer:
|
| 77 |
+
lr: 0.0001
|
| 78 |
+
discriminator_lr: 0.002
|
| 79 |
+
warmup_steps: 0
|
| 80 |
+
adam_beta1: 0.9
|
| 81 |
+
adam_beta2: 0.999
|
| 82 |
+
adam_epsilon: 1.0e-08
|
| 83 |
+
weight_decay: 0.0
|
| 84 |
+
g_reg_every: 4
|
| 85 |
+
d_reg_every: 16
|
| 86 |
+
|
| 87 |
+
logger:
|
| 88 |
+
neptune_project: null
|
| 89 |
+
neptune_api_token: null
|
| 90 |
+
wandb:
|
| 91 |
+
enabled: false
|
| 92 |
+
entity: null
|
| 93 |
+
project: "real-time"
|
| 94 |
+
|
| 95 |
+
callbacks:
|
| 96 |
+
- module_name: lightning.pytorch.callbacks
|
| 97 |
+
class_name: ModelCheckpoint
|
| 98 |
+
dirpath: ${ckpt_dir}
|
| 99 |
+
every_n_train_steps: 2000
|
| 100 |
+
save_top_k: -1
|
| 101 |
+
|
| 102 |
+
trainer:
|
| 103 |
+
accelerator: gpu
|
| 104 |
+
log_every_n_steps: 1
|
| 105 |
+
val_check_interval: 100000
|
| 106 |
+
|
| 107 |
+
data:
|
| 108 |
+
debug: False
|
| 109 |
+
train_bs: 12
|
| 110 |
+
accumulate_grad_batches: 1
|
| 111 |
+
n_sample_frames: 1
|
| 112 |
+
past_n: 1
|
| 113 |
+
num_workers: 8
|
| 114 |
+
ref_sample_margin: 10
|
| 115 |
+
train_width: 512
|
| 116 |
+
train_height: 512
|
| 117 |
+
union_bbox_scale: [1.2, 1.4]
|
| 118 |
+
mouth_bbox_scale: 1.5
|
| 119 |
+
eye_bbox_scale: 2.0
|
| 120 |
+
hybrid_face_mask: ${model.using_hybrid_mask}
|
| 121 |
+
flip_aug: True
|
| 122 |
+
filter_hand_videos: true
|
| 123 |
+
random_sample: False
|
| 124 |
+
dataset_file_path: []
|
| 125 |
+
cache_file_path: []
|
| 126 |
+
train_fps: 25
|
| 127 |
+
dataloader: FastVideoDatasetV2
|
| 128 |
+
|
| 129 |
+
val_data:
|
| 130 |
+
train_bs: 1
|
| 131 |
+
n_sample_frames: 40
|
| 132 |
+
past_n: 2
|
| 133 |
+
num_workers: 6
|
| 134 |
+
ref_sample_margin: ${data.ref_sample_margin}
|
| 135 |
+
train_width: ${data.train_width}
|
| 136 |
+
train_height: ${data.train_height}
|
| 137 |
+
union_bbox_scale: [1.2, 1.4]
|
| 138 |
+
mouth_bbox_scale: ${data.mouth_bbox_scale}
|
| 139 |
+
eye_bbox_scale: ${data.eye_bbox_scale}
|
| 140 |
+
hybrid_face_mask: ${data.hybrid_face_mask}
|
| 141 |
+
flip_aug: False
|
| 142 |
+
filter_hand_videos: ${data.filter_hand_videos}
|
| 143 |
+
random_sample: False
|
| 144 |
+
dataset_file_path: []
|
| 145 |
+
train_fps: ${data.train_fps}
|
| 146 |
+
dataloader: ${data.dataloader}
|
| 147 |
+
|
| 148 |
+
test_data:
|
| 149 |
+
height: 384
|
| 150 |
+
width: 672
|
| 151 |
+
image_paths_and_scales: []
|
| 152 |
+
|
| 153 |
+
inference:
|
| 154 |
+
output_dir: inference_outputs/${exp_name}
|
tools/visualization_0416/configs/head_animator_best_0506.yaml
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
debug: false
|
| 2 |
+
seed: 39
|
| 3 |
+
root_name: head_animator_LIA3D
|
| 4 |
+
exp_name: ${root_name}/inference
|
| 5 |
+
mode: train
|
| 6 |
+
n_epochs: null
|
| 7 |
+
cache_dir: cache
|
| 8 |
+
ckpt_dir: ${exp_name}/ckpt
|
| 9 |
+
resume_ckpt: ../pretrained_model/epoch=0-step=312000.ckpt
|
| 10 |
+
|
| 11 |
+
only_resume_state_dict: False
|
| 12 |
+
pretrained_ckpt: null
|
| 13 |
+
|
| 14 |
+
model:
|
| 15 |
+
module_name: model.head_animation.head_animator
|
| 16 |
+
class_name: HeadAnimatorModule
|
| 17 |
+
pretrained_ckpt: ${pretrained_ckpt}
|
| 18 |
+
using_hybrid_mask: True
|
| 19 |
+
output_dir: ${exp_name}
|
| 20 |
+
|
| 21 |
+
face_encoder:
|
| 22 |
+
module_name: model.head_animation.LIA_3d.face_encoder
|
| 23 |
+
class_name: FaceEncoder
|
| 24 |
+
image_size: 512
|
| 25 |
+
image_channel: 3
|
| 26 |
+
block_expansion: 64
|
| 27 |
+
num_down_blocks: 3
|
| 28 |
+
max_features: 512
|
| 29 |
+
reshape_channel: 32
|
| 30 |
+
reshape_depth: 16
|
| 31 |
+
num_resblocks: 6
|
| 32 |
+
|
| 33 |
+
motion_encoder:
|
| 34 |
+
module_name: model.head_animation.LIA_3d.motion_encoder
|
| 35 |
+
class_name: MotionEncoder
|
| 36 |
+
latent_dim: 512
|
| 37 |
+
size: ${model.face_encoder.image_size}
|
| 38 |
+
|
| 39 |
+
flow_estimator:
|
| 40 |
+
module_name: model.head_animation.LIA_3d.flow_estimator
|
| 41 |
+
class_name: FlowEstimator
|
| 42 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 43 |
+
motion_space: 64
|
| 44 |
+
|
| 45 |
+
face_generator:
|
| 46 |
+
module_name: model.head_animation.LIA_3d.face_generator
|
| 47 |
+
class_name: FaceGenerator
|
| 48 |
+
size: ${model.face_encoder.image_size}
|
| 49 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 50 |
+
outputsize: ${data.train_width}
|
| 51 |
+
reshape_channel: ${model.face_encoder.reshape_channel}
|
| 52 |
+
group_norm_channel: 32
|
| 53 |
+
flag_estimate_occlusion_map: True
|
| 54 |
+
|
| 55 |
+
discriminator:
|
| 56 |
+
module_name: model.head_animation.LIA.discriminator
|
| 57 |
+
class_name: Discriminator
|
| 58 |
+
size: ${data.train_width}
|
| 59 |
+
|
| 60 |
+
vgg_loss:
|
| 61 |
+
module_name: model.head_animation.VASA1.loss
|
| 62 |
+
class_name: VGGLoss
|
| 63 |
+
|
| 64 |
+
loss:
|
| 65 |
+
l_w_recon: 1
|
| 66 |
+
l_w_face_l1: 0
|
| 67 |
+
l_w_vgg: 2
|
| 68 |
+
l_w_gan: 0.2
|
| 69 |
+
l_w_face: 0
|
| 70 |
+
l_w_headpose: 0
|
| 71 |
+
l_w_gaze: 0
|
| 72 |
+
l_w_foreground: 0
|
| 73 |
+
l_w_local: 0
|
| 74 |
+
|
| 75 |
+
optimizer:
|
| 76 |
+
lr: 0.0001
|
| 77 |
+
discriminator_lr: 0.002
|
| 78 |
+
warmup_steps: 0
|
| 79 |
+
adam_beta1: 0.9
|
| 80 |
+
adam_beta2: 0.999
|
| 81 |
+
adam_epsilon: 1.0e-08
|
| 82 |
+
weight_decay: 0.0
|
| 83 |
+
g_reg_every: 4
|
| 84 |
+
d_reg_every: 16
|
| 85 |
+
|
| 86 |
+
logger:
|
| 87 |
+
neptune_project: null
|
| 88 |
+
neptune_api_token: null
|
| 89 |
+
wandb:
|
| 90 |
+
enabled: false
|
| 91 |
+
entity: null
|
| 92 |
+
project: "real-time"
|
| 93 |
+
|
| 94 |
+
callbacks:
|
| 95 |
+
- module_name: lightning.pytorch.callbacks
|
| 96 |
+
class_name: ModelCheckpoint
|
| 97 |
+
dirpath: ${ckpt_dir}
|
| 98 |
+
every_n_train_steps: 2000
|
| 99 |
+
save_top_k: -1
|
| 100 |
+
|
| 101 |
+
trainer:
|
| 102 |
+
accelerator: gpu
|
| 103 |
+
log_every_n_steps: 1
|
| 104 |
+
val_check_interval: 100000
|
| 105 |
+
|
| 106 |
+
data:
|
| 107 |
+
debug: False
|
| 108 |
+
train_bs: 12
|
| 109 |
+
accumulate_grad_batches: 1
|
| 110 |
+
n_sample_frames: 1
|
| 111 |
+
past_n: 1
|
| 112 |
+
num_workers: 8
|
| 113 |
+
ref_sample_margin: 10
|
| 114 |
+
train_width: 512
|
| 115 |
+
train_height: 512
|
| 116 |
+
union_bbox_scale: [1.2, 1.4]
|
| 117 |
+
mouth_bbox_scale: 1.5
|
| 118 |
+
eye_bbox_scale: 2.0
|
| 119 |
+
hybrid_face_mask: ${model.using_hybrid_mask}
|
| 120 |
+
flip_aug: True
|
| 121 |
+
filter_hand_videos: true
|
| 122 |
+
random_sample: False
|
| 123 |
+
dataset_file_path: []
|
| 124 |
+
cache_file_path: []
|
| 125 |
+
train_fps: 25
|
| 126 |
+
dataloader: FastVideoDatasetV2
|
| 127 |
+
|
| 128 |
+
val_data:
|
| 129 |
+
train_bs: 1
|
| 130 |
+
n_sample_frames: 40
|
| 131 |
+
past_n: 2
|
| 132 |
+
num_workers: 6
|
| 133 |
+
ref_sample_margin: ${data.ref_sample_margin}
|
| 134 |
+
train_width: ${data.train_width}
|
| 135 |
+
train_height: ${data.train_height}
|
| 136 |
+
union_bbox_scale: [1.2, 1.4]
|
| 137 |
+
mouth_bbox_scale: ${data.mouth_bbox_scale}
|
| 138 |
+
eye_bbox_scale: ${data.eye_bbox_scale}
|
| 139 |
+
hybrid_face_mask: ${data.hybrid_face_mask}
|
| 140 |
+
flip_aug: False
|
| 141 |
+
filter_hand_videos: ${data.filter_hand_videos}
|
| 142 |
+
random_sample: False
|
| 143 |
+
dataset_file_path: []
|
| 144 |
+
train_fps: ${data.train_fps}
|
| 145 |
+
dataloader: ${data.dataloader}
|
| 146 |
+
|
| 147 |
+
test_data:
|
| 148 |
+
height: 384
|
| 149 |
+
width: 672
|
| 150 |
+
image_paths_and_scales: []
|
| 151 |
+
|
| 152 |
+
inference:
|
| 153 |
+
output_dir: inference_outputs/${exp_name}
|
tools/visualization_0416/img_to_latent.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# 获取项目根目录并添加到 sys.path 最前面,确保导入正确的 utils 模块
|
| 5 |
+
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 6 |
+
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
|
| 7 |
+
if _PROJECT_ROOT not in sys.path:
|
| 8 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import torchvision.transforms as T
|
| 14 |
+
from omegaconf import OmegaConf
|
| 15 |
+
import fire
|
| 16 |
+
|
| 17 |
+
def init_fn(config_path):
|
| 18 |
+
from utils import instantiate
|
| 19 |
+
transform = T.Compose([T.Resize((512, 512)), T.ToTensor(), T.Normalize([0.5], [0.5])])
|
| 20 |
+
config = OmegaConf.load(config_path)
|
| 21 |
+
module = instantiate(config.model, instantiate_module=False)
|
| 22 |
+
model = module(config=config)
|
| 23 |
+
checkpoint = torch.load(config.resume_ckpt, map_location="cpu")
|
| 24 |
+
model.load_state_dict(checkpoint["state_dict"], strict=False)
|
| 25 |
+
model.eval()
|
| 26 |
+
motion_encoder = model.motion_encoder
|
| 27 |
+
return {"transform": transform, "motion_encoder": motion_encoder}
|
| 28 |
+
|
| 29 |
+
def extract_motion_latent(
|
| 30 |
+
mask_image_path='./test_case/test_img_masked.png',
|
| 31 |
+
config_path='./configs/head_animator_best_0506.yaml',
|
| 32 |
+
save_npz_path='./test_case/test_img_resize.npz',
|
| 33 |
+
version="0506"):
|
| 34 |
+
sys.path.insert(0, f'./utils/model_{version}')
|
| 35 |
+
config_path = config_path.replace("0506", version)
|
| 36 |
+
context = init_fn(config_path)
|
| 37 |
+
transform = context["transform"]
|
| 38 |
+
motion_encoder = context["motion_encoder"]
|
| 39 |
+
img = Image.open(mask_image_path).convert("RGB")
|
| 40 |
+
img_tensor = transform(img).unsqueeze(0)
|
| 41 |
+
with torch.no_grad():
|
| 42 |
+
latent = motion_encoder(img_tensor)[0] # [1, 512]
|
| 43 |
+
latent_np = latent.numpy()
|
| 44 |
+
|
| 45 |
+
# 如果文件已存在,先加载原有数据
|
| 46 |
+
if os.path.exists(save_npz_path):
|
| 47 |
+
existing_data = np.load(save_npz_path, allow_pickle=True)
|
| 48 |
+
data_dict = dict(existing_data)
|
| 49 |
+
existing_data.close() # 关闭文件
|
| 50 |
+
else:
|
| 51 |
+
data_dict = {}
|
| 52 |
+
|
| 53 |
+
# 更新或添加新的键值对
|
| 54 |
+
data_dict.update({
|
| 55 |
+
'video_id': os.path.basename(save_npz_path)[:-4],
|
| 56 |
+
'mask_img_path': mask_image_path,
|
| 57 |
+
'ref_img_path': save_npz_path.replace('npz', 'png'),
|
| 58 |
+
'motion_latent': latent_np
|
| 59 |
+
})
|
| 60 |
+
|
| 61 |
+
# 保存更新后的数据
|
| 62 |
+
np.savez(save_npz_path, **data_dict)
|
| 63 |
+
# np.savez(
|
| 64 |
+
# save_npz_path,
|
| 65 |
+
# video_id=os.path.basename(save_npz_path)[:-4],
|
| 66 |
+
# mask_img_path=mask_image_path,
|
| 67 |
+
# ref_img_path=save_npz_path.replace('npz', 'png'),
|
| 68 |
+
# motion_latent=latent_np
|
| 69 |
+
# )
|
| 70 |
+
if __name__ == '__main__':
|
| 71 |
+
fire.Fire(extract_motion_latent)
|
tools/visualization_0416/img_to_mask.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
input: image_path
|
| 3 |
+
output: save a masked image and resized image
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import urllib.request
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
import cv2
|
| 11 |
+
from PIL import Image
|
| 12 |
+
from omegaconf import OmegaConf
|
| 13 |
+
from torchvision import transforms
|
| 14 |
+
from utils.face_detector import FaceDetector
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
def generate_crop_bounding_box(h, w, center, size=512):
|
| 18 |
+
"""
|
| 19 |
+
Crop a region of a specified size from the given center point,
|
| 20 |
+
filling the area outside the image boundary with zeros.
|
| 21 |
+
|
| 22 |
+
:param image: The input image in NumPy array form, shape (H, W, C)
|
| 23 |
+
:param center: The center point (y, x) to start cropping from
|
| 24 |
+
:param size: The size of the cropped region (default is 512)
|
| 25 |
+
:return: The cropped region with padding, shape (size, size, C)
|
| 26 |
+
"""
|
| 27 |
+
half_size = size // 2 # Half the size for the cropping region
|
| 28 |
+
|
| 29 |
+
# Calculate the top-left and bottom-right coordinates of the cropping region
|
| 30 |
+
y1 = max(center[0] - half_size, 0) # Ensure the y1 index is not less than 0
|
| 31 |
+
x1 = max(center[1] - half_size, 0) # Ensure the x1 index is not less than 0
|
| 32 |
+
y2 = min(center[0] + half_size, h) # Ensure the y2 index does not exceed the image height
|
| 33 |
+
x2 = min(center[1] + half_size, w) # Ensure the x2 index does not exceed the image width
|
| 34 |
+
return [x1, y1, x2, y2]
|
| 35 |
+
|
| 36 |
+
def crop_from_bbox(image, center, bbox, size=512):
|
| 37 |
+
"""
|
| 38 |
+
Crop a region of a specified size from the given center point,
|
| 39 |
+
filling the area outside the image boundary with zeros.
|
| 40 |
+
|
| 41 |
+
:param image: The input image in NumPy array form, shape (H, W, C)
|
| 42 |
+
:param center: The center point (y, x) to start cropping from
|
| 43 |
+
:param size: The size of the cropped region (default is 512)
|
| 44 |
+
:return: The cropped region with padding, shape (size, size, C)
|
| 45 |
+
"""
|
| 46 |
+
h, w = image.shape[:2] # Get the height and width of the image
|
| 47 |
+
x1, y1, x2, y2 = bbox
|
| 48 |
+
half_size = size // 2 # Half the size for the cropping region
|
| 49 |
+
# Create a zero-filled array for padding
|
| 50 |
+
cropped = np.zeros((size, size, image.shape[2]), dtype=image.dtype)
|
| 51 |
+
|
| 52 |
+
# Copy the valid region from the original image to the cropped region
|
| 53 |
+
cropped[(y1 - (center[0] - half_size)):(y2 - (center[0] - half_size)),
|
| 54 |
+
(x1 - (center[1] - half_size)):(x2 - (center[1] - half_size))] = image[y1:y2, x1:x2]
|
| 55 |
+
|
| 56 |
+
return cropped
|
| 57 |
+
|
| 58 |
+
face_detector = None
|
| 59 |
+
model_path = "./utils/face_landmarker.task"
|
| 60 |
+
if not os.path.exists(model_path):
|
| 61 |
+
print("Downloading face landmarker model...")
|
| 62 |
+
url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
|
| 63 |
+
urllib.request.urlretrieve(url, model_path)
|
| 64 |
+
|
| 65 |
+
def initialize_face_detector():
|
| 66 |
+
global face_detector
|
| 67 |
+
if face_detector is None:
|
| 68 |
+
face_detector = FaceDetector(
|
| 69 |
+
mediapipe_model_asset_path=model_path,
|
| 70 |
+
face_detection_confidence=0.5,
|
| 71 |
+
num_faces=1,
|
| 72 |
+
)
|
| 73 |
+
initialize_face_detector()
|
| 74 |
+
|
| 75 |
+
def augmentation(images, transform, state=None):
|
| 76 |
+
if state is not None:
|
| 77 |
+
torch.set_rng_state(state)
|
| 78 |
+
if isinstance(images, list):
|
| 79 |
+
transformed = [transforms.functional.to_tensor(img) for img in images]
|
| 80 |
+
return transform(torch.stack(transformed, dim=0))
|
| 81 |
+
return transform(transforms.functional.to_tensor(images))
|
| 82 |
+
|
| 83 |
+
def scale_bbox(bbox, h, w, scale=1.8):
|
| 84 |
+
sw = (bbox[2] - bbox[0]) / 2
|
| 85 |
+
sh = (bbox[3] - bbox[1]) / 2
|
| 86 |
+
cx = (bbox[0] + bbox[2]) / 2
|
| 87 |
+
cy = (bbox[1] + bbox[3]) / 2
|
| 88 |
+
sw *= scale
|
| 89 |
+
sh *= scale
|
| 90 |
+
scaled = [cx - sw, cy - sh, cx + sw, cy + sh]
|
| 91 |
+
scaled[0] = np.clip(scaled[0], 0, w)
|
| 92 |
+
scaled[2] = np.clip(scaled[2], 0, w)
|
| 93 |
+
scaled[1] = np.clip(scaled[1], 0, h)
|
| 94 |
+
scaled[3] = np.clip(scaled[3], 0, h)
|
| 95 |
+
return scaled
|
| 96 |
+
|
| 97 |
+
def get_mask(bbox, hd, wd, scale=1.0, return_pil=True):
|
| 98 |
+
if min(bbox) < 0:
|
| 99 |
+
raise Exception("Invalid mask")
|
| 100 |
+
bbox = scale_bbox(bbox, hd, wd, scale=scale)
|
| 101 |
+
x0, y0, x1, y1 = [int(v) for v in bbox]
|
| 102 |
+
mask = np.zeros((hd, wd, 3), dtype=np.uint8)
|
| 103 |
+
mask[y0:y1, x0:x1, :] = 255
|
| 104 |
+
if return_pil:
|
| 105 |
+
return Image.fromarray(mask)
|
| 106 |
+
return mask
|
| 107 |
+
|
| 108 |
+
def generate_masked_image(
|
| 109 |
+
image_path="./test_case/test_img.png",
|
| 110 |
+
save_path="./test_case/test_img.png",
|
| 111 |
+
crop=False,
|
| 112 |
+
union_bbox_scale=1.3):
|
| 113 |
+
cfg = OmegaConf.load("./configs/audio_head_animator.yaml")
|
| 114 |
+
pixel_transform = transforms.Compose([
|
| 115 |
+
transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC),
|
| 116 |
+
transforms.Normalize([0.5], [0.5]),
|
| 117 |
+
])
|
| 118 |
+
resize_transform = transforms.Resize((512, 512), interpolation=transforms.InterpolationMode.BICUBIC)
|
| 119 |
+
|
| 120 |
+
img = Image.open(image_path).convert("RGB")
|
| 121 |
+
state = torch.get_rng_state()
|
| 122 |
+
|
| 123 |
+
# Get face detection results first
|
| 124 |
+
det_res = face_detector.get_face_xy_rotation_and_keypoints(
|
| 125 |
+
np.array(img), cfg.data.mouth_bbox_scale, cfg.data.eye_bbox_scale
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
person_id = 0
|
| 129 |
+
mouth_bbox = np.array(det_res[6][person_id])
|
| 130 |
+
eye_bbox = det_res[7][person_id]
|
| 131 |
+
face_contour = np.array(det_res[8][person_id])
|
| 132 |
+
left_eye_bbox = eye_bbox["left_eye"]
|
| 133 |
+
right_eye_bbox = eye_bbox["right_eye"]
|
| 134 |
+
|
| 135 |
+
# If crop is True, crop the face region first
|
| 136 |
+
if crop:
|
| 137 |
+
# Get the face bounding box and calculate center
|
| 138 |
+
face_bbox = det_res[5][person_id] # Get the face bounding box from det_res[5]
|
| 139 |
+
# face_bbox is [(x1, y1), (x2, y2)]
|
| 140 |
+
x1, y1 = face_bbox[0]
|
| 141 |
+
x2, y2 = face_bbox[1]
|
| 142 |
+
center = [(y1 + y2) // 2, (x1 + x2) // 2]
|
| 143 |
+
|
| 144 |
+
# Calculate the size for cropping
|
| 145 |
+
width = x2 - x1
|
| 146 |
+
height = y2 - y1
|
| 147 |
+
max_size = int(max(width, height) * union_bbox_scale)
|
| 148 |
+
|
| 149 |
+
# Get the image dimensions
|
| 150 |
+
hd, wd = img.size[1], img.size[0]
|
| 151 |
+
|
| 152 |
+
# Generate the crop bounding box
|
| 153 |
+
crop_bbox = generate_crop_bounding_box(hd, wd, center, max_size)
|
| 154 |
+
|
| 155 |
+
# Crop the image
|
| 156 |
+
img_array = np.array(img)
|
| 157 |
+
cropped_img = crop_from_bbox(img_array, center, crop_bbox, size=max_size)
|
| 158 |
+
img = Image.fromarray(cropped_img)
|
| 159 |
+
|
| 160 |
+
# Update the face detection results for the cropped image
|
| 161 |
+
det_res = face_detector.get_face_xy_rotation_and_keypoints(
|
| 162 |
+
cropped_img, cfg.data.mouth_bbox_scale, cfg.data.eye_bbox_scale
|
| 163 |
+
)
|
| 164 |
+
mouth_bbox = np.array(det_res[6][person_id])
|
| 165 |
+
eye_bbox = det_res[7][person_id]
|
| 166 |
+
face_contour = np.array(det_res[8][person_id])
|
| 167 |
+
left_eye_bbox = eye_bbox["left_eye"]
|
| 168 |
+
right_eye_bbox = eye_bbox["right_eye"]
|
| 169 |
+
|
| 170 |
+
pixel_values_ref = augmentation([img], pixel_transform, state)
|
| 171 |
+
pixel_values_ref = (pixel_values_ref + 1) / 2
|
| 172 |
+
new_hd, new_wd = img.size[1], img.size[0]
|
| 173 |
+
|
| 174 |
+
mouth_mask = resize_transform(get_mask(mouth_bbox, new_hd, new_wd, scale=1.0))
|
| 175 |
+
left_eye_mask = resize_transform(get_mask(left_eye_bbox, new_hd, new_wd, scale=1.0))
|
| 176 |
+
right_eye_mask = resize_transform(get_mask(right_eye_bbox, new_hd, new_wd, scale=1.0))
|
| 177 |
+
face_contour = resize_transform(Image.fromarray(face_contour))
|
| 178 |
+
|
| 179 |
+
eye_mask = np.bitwise_or(np.array(left_eye_mask), np.array(right_eye_mask))
|
| 180 |
+
combined_mask = np.bitwise_or(eye_mask, np.array(mouth_mask))
|
| 181 |
+
|
| 182 |
+
combined_mask_tensor = torch.from_numpy(combined_mask / 255.0).permute(2, 0, 1).unsqueeze(0)
|
| 183 |
+
face_contour_tensor = torch.from_numpy(np.array(face_contour) / 255.0).permute(2, 0, 1).unsqueeze(0)
|
| 184 |
+
|
| 185 |
+
masked_ref = pixel_values_ref * combined_mask_tensor + face_contour_tensor * (1 - combined_mask_tensor)
|
| 186 |
+
masked_ref = masked_ref.clamp(0, 1)
|
| 187 |
+
masked_ref_np = (masked_ref.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
|
| 188 |
+
|
| 189 |
+
base, _ = os.path.splitext(save_path)
|
| 190 |
+
resized_img = (pixel_values_ref.squeeze(0).permute(1, 2, 0).cpu().numpy().clip(0, 1) * 255).astype(np.uint8)
|
| 191 |
+
Image.fromarray(resized_img).save(f"{base}_resize.png")
|
| 192 |
+
Image.fromarray(masked_ref_np).save(f"{base}_masked.png")
|
| 193 |
+
|
| 194 |
+
if __name__ == '__main__':
|
| 195 |
+
import fire
|
| 196 |
+
fire.Fire(generate_masked_image)
|
| 197 |
+
# python img_to_mask.py --image_path /mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/KristiNoem2-Scene-001.png --save_path /mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/KristiNoem2-Scene-001.png --crop True --union_bbox_scale 1.6
|
| 198 |
+
# python img_to_latent.py --mask_image_path ./test_case/ChrisVanHollen0-Scene-003_masked.png --save_npz_path ./test_case/ChrisVanHollen0-Scene-003_resize.npz
|
| 199 |
+
# python latent_two_video.py --npz_path ./test_case/ChrisVanHollen0-Scene-003_resize.npz --save_dir ./test_case/
|
tools/visualization_0416/latent_to_video.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# 获取项目根目录并添加到 sys.path 最前面,确保导入正确的 utils 模块
|
| 5 |
+
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 6 |
+
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
|
| 7 |
+
if _PROJECT_ROOT not in sys.path:
|
| 8 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import torchvision.transforms as T
|
| 14 |
+
from omegaconf import OmegaConf
|
| 15 |
+
import fire
|
| 16 |
+
import imageio
|
| 17 |
+
import moviepy.editor as mp
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
|
| 20 |
+
def init_fn(config_path, version):
|
| 21 |
+
sys.path.insert(0, f'./utils/model_{version}')
|
| 22 |
+
from utils import instantiate
|
| 23 |
+
config = OmegaConf.load(config_path)
|
| 24 |
+
module = instantiate(config.model, instantiate_module=False)
|
| 25 |
+
model = module(config=config)
|
| 26 |
+
checkpoint = torch.load(config.resume_ckpt, map_location="cpu")
|
| 27 |
+
model.load_state_dict(checkpoint["state_dict"], strict=False)
|
| 28 |
+
model.eval().to("cuda")
|
| 29 |
+
transform = T.Compose([
|
| 30 |
+
T.Resize((512, 512)),
|
| 31 |
+
T.ToTensor(),
|
| 32 |
+
T.Normalize([0.5], [0.5]),
|
| 33 |
+
])
|
| 34 |
+
return {
|
| 35 |
+
"transform": transform,
|
| 36 |
+
"flow_estimator": model.flow_estimator,
|
| 37 |
+
"face_generator": model.face_generator,
|
| 38 |
+
"face_encoder": model.face_encoder,
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
def latent_to_video(
|
| 42 |
+
npz_dir="./test_case/",
|
| 43 |
+
save_dir="./test_case/",
|
| 44 |
+
save_fps: int = 25,
|
| 45 |
+
config_path: str = './configs/head_animator_best_0416.yaml',
|
| 46 |
+
version: str = '0416',
|
| 47 |
+
):
|
| 48 |
+
# 处理相对路径:
|
| 49 |
+
# - npz_dir 和 save_dir:如果是相对路径,转换为基于项目根目录的绝对路径
|
| 50 |
+
# - config_path:如果是相对路径,转换为基于当前脚本目录(tools/visualization_0416/)的绝对路径
|
| 51 |
+
if not os.path.isabs(npz_dir):
|
| 52 |
+
npz_dir = os.path.join(_PROJECT_ROOT, npz_dir)
|
| 53 |
+
if not os.path.isabs(save_dir):
|
| 54 |
+
save_dir = os.path.join(_PROJECT_ROOT, save_dir)
|
| 55 |
+
if not os.path.isabs(config_path):
|
| 56 |
+
config_path = os.path.join(_SCRIPT_DIR, config_path)
|
| 57 |
+
|
| 58 |
+
# 规范化路径(去除多余的 . 和 ..)
|
| 59 |
+
npz_dir = os.path.normpath(npz_dir)
|
| 60 |
+
save_dir = os.path.normpath(save_dir)
|
| 61 |
+
config_path = os.path.normpath(config_path)
|
| 62 |
+
|
| 63 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 64 |
+
# 只在文件名上做版本号替换,避免把路径里的 "0416" 一并替换成 "0506"
|
| 65 |
+
config_dir = os.path.dirname(config_path)
|
| 66 |
+
config_name = os.path.basename(config_path)
|
| 67 |
+
config_name = config_name.replace("0416", version)
|
| 68 |
+
config_path = os.path.join(config_dir, config_name)
|
| 69 |
+
|
| 70 |
+
# Initialize models only once
|
| 71 |
+
print("Initializing models...")
|
| 72 |
+
print(f"NPZ directory: {npz_dir}")
|
| 73 |
+
print(f"Save directory: {save_dir}")
|
| 74 |
+
ctx = init_fn(config_path, version)
|
| 75 |
+
transform = ctx["transform"]
|
| 76 |
+
flow_estimator = ctx["flow_estimator"]
|
| 77 |
+
face_generator = ctx["face_generator"]
|
| 78 |
+
face_encoder = ctx["face_encoder"]
|
| 79 |
+
|
| 80 |
+
# Get all npz files
|
| 81 |
+
if not os.path.exists(npz_dir):
|
| 82 |
+
print(f"Error: NPZ directory does not exist: {npz_dir}")
|
| 83 |
+
return
|
| 84 |
+
|
| 85 |
+
npz_files = [f for f in os.listdir(npz_dir) if f.endswith('_output.npz')]
|
| 86 |
+
print(f"Found {len(npz_files)} files to process")
|
| 87 |
+
|
| 88 |
+
# Process each file
|
| 89 |
+
for npz_file in tqdm(npz_files, desc="Processing files"):
|
| 90 |
+
if not npz_file.endswith('.npz'): continue
|
| 91 |
+
try:
|
| 92 |
+
npz_path = os.path.join(npz_dir, npz_file)
|
| 93 |
+
data = np.load(npz_path, allow_pickle=True)
|
| 94 |
+
motion_latent = torch.from_numpy(data["motion_latent"]).to("cuda").float()
|
| 95 |
+
if len(motion_latent.shape) == 3:
|
| 96 |
+
motion_latent = motion_latent.squeeze(0)
|
| 97 |
+
num_frames = motion_latent.shape[0]
|
| 98 |
+
print(f"\nProcessing {npz_file} with {num_frames} frames")
|
| 99 |
+
|
| 100 |
+
# 处理 ref_img_path - 如果是相对路径,基于项目根目录解析
|
| 101 |
+
ref_img_path = str(data["ref_img_path"])
|
| 102 |
+
if not os.path.isabs(ref_img_path):
|
| 103 |
+
ref_img_path = os.path.join(_PROJECT_ROOT, ref_img_path)
|
| 104 |
+
ref_img = Image.open(ref_img_path).convert("RGB")
|
| 105 |
+
ref_img = transform(ref_img).unsqueeze(0).to("cuda")
|
| 106 |
+
# np.save("/mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/face_encoder_input.npy", ref_img.cpu().numpy())
|
| 107 |
+
|
| 108 |
+
with torch.no_grad():
|
| 109 |
+
face_feat = face_encoder(ref_img)
|
| 110 |
+
# np.save("/mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/face_encoder_output.npy", face_feat.cpu().numpy())
|
| 111 |
+
recon_list = []
|
| 112 |
+
for i in range(0, num_frames):
|
| 113 |
+
tgt = flow_estimator(motion_latent[0:1], motion_latent[i:i+1])
|
| 114 |
+
recon_list.append(face_generator(tgt, face_feat))
|
| 115 |
+
|
| 116 |
+
recon = torch.cat(recon_list, dim=0)
|
| 117 |
+
video_np = recon.permute(0, 2, 3, 1).cpu().numpy()
|
| 118 |
+
video_np = np.clip((video_np + 1) / 2 * 255, 0, 255).astype("uint8")
|
| 119 |
+
|
| 120 |
+
video_id = str(data["video_id"])
|
| 121 |
+
# Remove leading dash to prevent FFMPEG command line parsing issues
|
| 122 |
+
if video_id.startswith('-'):
|
| 123 |
+
video_id = video_id[1:]
|
| 124 |
+
|
| 125 |
+
if num_frames == 1:
|
| 126 |
+
out_path = os.path.join(save_dir, f"{video_id}_rec.png")
|
| 127 |
+
Image.fromarray(video_np[0]).save(out_path)
|
| 128 |
+
else:
|
| 129 |
+
temp_mp4 = os.path.join(save_dir, f"{video_id}_temp.mp4")
|
| 130 |
+
final_mp4 = os.path.join(save_dir, f"{video_id}.mp4")
|
| 131 |
+
finalfinal_mp4 = os.path.join(save_dir, f"{str(data['video_id'])}.mp4")
|
| 132 |
+
with imageio.get_writer(temp_mp4, fps=save_fps) as writer:
|
| 133 |
+
for frame in video_np:
|
| 134 |
+
writer.append_data(frame)
|
| 135 |
+
# 处理 audio_path - 如果是相对路径,基于项目根目录解析
|
| 136 |
+
audio_path = str(data["audio_path"]) if "audio_path" in data.files else None
|
| 137 |
+
if audio_path and not os.path.isabs(audio_path):
|
| 138 |
+
audio_path = os.path.join(_PROJECT_ROOT, audio_path)
|
| 139 |
+
if audio_path and os.path.exists(audio_path):
|
| 140 |
+
clip = mp.VideoFileClip(temp_mp4)
|
| 141 |
+
audio = mp.AudioFileClip(audio_path)
|
| 142 |
+
clip.set_audio(audio).write_videofile(final_mp4, codec="libx264", audio_codec="aac")
|
| 143 |
+
clip.close()
|
| 144 |
+
audio.close()
|
| 145 |
+
os.remove(temp_mp4)
|
| 146 |
+
else:
|
| 147 |
+
os.rename(temp_mp4, final_mp4)
|
| 148 |
+
os.rename(final_mp4, finalfinal_mp4)
|
| 149 |
+
except Exception as e:
|
| 150 |
+
print(f"Error processing {npz_file}: {str(e)}")
|
| 151 |
+
continue
|
| 152 |
+
|
| 153 |
+
if __name__ == "__main__":
|
| 154 |
+
fire.Fire(latent_to_video)
|
| 155 |
+
# Example usage:
|
| 156 |
+
# python latent_to_video.py --npz_dir ./test_case/ --save_dir ./test_case/ --config_path ./configs/head_animator_best_0409.yaml --version 0416
|
tools/visualization_0416/latent_to_video_batch.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
批处理优化版本的 latent_to_video
|
| 3 |
+
相比原版逐帧处理,使用批处理加速约 10-30 倍
|
| 4 |
+
v2: 优化 GPU→CPU 传输和视频编码,使用流式处理
|
| 5 |
+
"""
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# 获取项目根目录并添加到 sys.path 最前面,确保导入正确的 utils 模块
|
| 10 |
+
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 11 |
+
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
|
| 12 |
+
if _PROJECT_ROOT not in sys.path:
|
| 13 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
from PIL import Image
|
| 18 |
+
import torchvision.transforms as T
|
| 19 |
+
from omegaconf import OmegaConf
|
| 20 |
+
import fire
|
| 21 |
+
import imageio
|
| 22 |
+
import moviepy.editor as mp
|
| 23 |
+
from tqdm import tqdm
|
| 24 |
+
import time
|
| 25 |
+
import subprocess
|
| 26 |
+
import tempfile
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def init_fn(config_path, version):
|
| 30 |
+
sys.path.insert(0, f'./utils/model_{version}')
|
| 31 |
+
from utils import instantiate
|
| 32 |
+
config = OmegaConf.load(config_path)
|
| 33 |
+
module = instantiate(config.model, instantiate_module=False)
|
| 34 |
+
model = module(config=config)
|
| 35 |
+
checkpoint = torch.load(config.resume_ckpt, map_location="cpu")
|
| 36 |
+
model.load_state_dict(checkpoint["state_dict"], strict=False)
|
| 37 |
+
model.eval().to("cuda")
|
| 38 |
+
transform = T.Compose([
|
| 39 |
+
T.Resize((512, 512)),
|
| 40 |
+
T.ToTensor(),
|
| 41 |
+
T.Normalize([0.5], [0.5]),
|
| 42 |
+
])
|
| 43 |
+
return {
|
| 44 |
+
"transform": transform,
|
| 45 |
+
"flow_estimator": model.flow_estimator,
|
| 46 |
+
"face_generator": model.face_generator,
|
| 47 |
+
"face_encoder": model.face_encoder,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def latent_to_video_batch(
|
| 52 |
+
npz_dir="./test_case/",
|
| 53 |
+
save_dir="./test_case/",
|
| 54 |
+
save_fps: int = 25,
|
| 55 |
+
config_path: str = './configs/head_animator_best_0416.yaml',
|
| 56 |
+
version: str = '0416',
|
| 57 |
+
batch_size: int = 32,
|
| 58 |
+
use_fp16: bool = True,
|
| 59 |
+
):
|
| 60 |
+
"""
|
| 61 |
+
批处理优化版本的 latent_to_video
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
npz_dir: NPZ 文件目录
|
| 65 |
+
save_dir: 输出视频目录
|
| 66 |
+
save_fps: 输出视频帧率
|
| 67 |
+
config_path: 模型配置文件路径
|
| 68 |
+
version: 模型版本
|
| 69 |
+
batch_size: 批处理大小,根据显存调整 (默认 32,显存不足可降到 16 或 8)
|
| 70 |
+
use_fp16: 是否使用混合精度加速 (默认 True)
|
| 71 |
+
"""
|
| 72 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 73 |
+
config_path = config_path.replace("0416", version)
|
| 74 |
+
|
| 75 |
+
# Initialize models only once
|
| 76 |
+
print("Initializing models...")
|
| 77 |
+
ctx = init_fn(config_path, version)
|
| 78 |
+
transform = ctx["transform"]
|
| 79 |
+
flow_estimator = ctx["flow_estimator"]
|
| 80 |
+
face_generator = ctx["face_generator"]
|
| 81 |
+
face_encoder = ctx["face_encoder"]
|
| 82 |
+
|
| 83 |
+
# Get all npz files
|
| 84 |
+
npz_files = [f for f in os.listdir(npz_dir) if f.endswith('_output.npz')]
|
| 85 |
+
print(f"Found {len(npz_files)} files to process")
|
| 86 |
+
print(f"Batch size: {batch_size}, FP16: {use_fp16}")
|
| 87 |
+
|
| 88 |
+
total_frames = 0
|
| 89 |
+
total_time = 0
|
| 90 |
+
|
| 91 |
+
# Process each file
|
| 92 |
+
for npz_file in tqdm(npz_files, desc="Processing files"):
|
| 93 |
+
if not npz_file.endswith('.npz'):
|
| 94 |
+
continue
|
| 95 |
+
try:
|
| 96 |
+
npz_path = os.path.join(npz_dir, npz_file)
|
| 97 |
+
data = np.load(npz_path, allow_pickle=True)
|
| 98 |
+
motion_latent = torch.from_numpy(data["motion_latent"]).to("cuda").float()
|
| 99 |
+
if len(motion_latent.shape) == 3:
|
| 100 |
+
motion_latent = motion_latent.squeeze(0)
|
| 101 |
+
num_frames = motion_latent.shape[0]
|
| 102 |
+
print(f"\nProcessing {npz_file} with {num_frames} frames")
|
| 103 |
+
|
| 104 |
+
# 处理 ref_img_path - 如果是相对路径,基于项目根目录解析
|
| 105 |
+
ref_img_path = str(data["ref_img_path"])
|
| 106 |
+
if not os.path.isabs(ref_img_path):
|
| 107 |
+
ref_img_path = os.path.join(_PROJECT_ROOT, ref_img_path)
|
| 108 |
+
ref_img = Image.open(ref_img_path).convert("RGB")
|
| 109 |
+
ref_img = transform(ref_img).unsqueeze(0).to("cuda")
|
| 110 |
+
|
| 111 |
+
video_id = str(data["video_id"])
|
| 112 |
+
# Remove leading dash to prevent FFMPEG command line parsing issues
|
| 113 |
+
if video_id.startswith('-'):
|
| 114 |
+
video_id = video_id[1:]
|
| 115 |
+
|
| 116 |
+
# 处理 audio_path
|
| 117 |
+
audio_path = str(data["audio_path"]) if "audio_path" in data.files else None
|
| 118 |
+
if audio_path and not os.path.isabs(audio_path):
|
| 119 |
+
audio_path = os.path.join(_PROJECT_ROOT, audio_path)
|
| 120 |
+
|
| 121 |
+
start_time = time.time()
|
| 122 |
+
|
| 123 |
+
# 准备输出路径
|
| 124 |
+
temp_mp4 = os.path.join(save_dir, f"{video_id}_temp.mp4")
|
| 125 |
+
final_mp4 = os.path.join(save_dir, f"{video_id}.mp4")
|
| 126 |
+
finalfinal_mp4 = os.path.join(save_dir, f"{str(data['video_id'])}.mp4")
|
| 127 |
+
|
| 128 |
+
if num_frames == 1:
|
| 129 |
+
# 单帧情况
|
| 130 |
+
with torch.no_grad():
|
| 131 |
+
with torch.cuda.amp.autocast(enabled=use_fp16):
|
| 132 |
+
face_feat = face_encoder(ref_img)
|
| 133 |
+
tgt = flow_estimator(motion_latent[0:1], motion_latent[0:1])
|
| 134 |
+
recon = face_generator(tgt, face_feat)
|
| 135 |
+
if use_fp16:
|
| 136 |
+
recon = recon.float()
|
| 137 |
+
|
| 138 |
+
video_np = recon.permute(0, 2, 3, 1).cpu().numpy()
|
| 139 |
+
video_np = np.clip((video_np + 1) / 2 * 255, 0, 255).astype("uint8")
|
| 140 |
+
out_path = os.path.join(save_dir, f"{video_id}_rec.png")
|
| 141 |
+
Image.fromarray(video_np[0]).save(out_path)
|
| 142 |
+
else:
|
| 143 |
+
# 多帧情况 - 使用 FFmpeg pipe 流式编码
|
| 144 |
+
# 启动 FFmpeg 进程
|
| 145 |
+
ffmpeg_cmd = [
|
| 146 |
+
'ffmpeg', '-y',
|
| 147 |
+
'-f', 'rawvideo',
|
| 148 |
+
'-vcodec', 'rawvideo',
|
| 149 |
+
'-s', '512x512',
|
| 150 |
+
'-pix_fmt', 'rgb24',
|
| 151 |
+
'-r', str(save_fps),
|
| 152 |
+
'-i', '-',
|
| 153 |
+
'-c:v', 'libx264',
|
| 154 |
+
'-preset', 'fast',
|
| 155 |
+
'-crf', '18',
|
| 156 |
+
'-pix_fmt', 'yuv420p',
|
| 157 |
+
temp_mp4
|
| 158 |
+
]
|
| 159 |
+
|
| 160 |
+
ffmpeg_process = subprocess.Popen(
|
| 161 |
+
ffmpeg_cmd,
|
| 162 |
+
stdin=subprocess.PIPE,
|
| 163 |
+
stdout=subprocess.DEVNULL,
|
| 164 |
+
stderr=subprocess.DEVNULL
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
with torch.no_grad():
|
| 168 |
+
with torch.cuda.amp.autocast(enabled=use_fp16):
|
| 169 |
+
face_feat = face_encoder(ref_img) # (1, 32, 16, 64, 64)
|
| 170 |
+
ref_latent = motion_latent[0:1] # 参考帧的 latent
|
| 171 |
+
|
| 172 |
+
# 批处理推理 + 流式写入
|
| 173 |
+
for i in range(0, num_frames, batch_size):
|
| 174 |
+
batch_end = min(i + batch_size, num_frames)
|
| 175 |
+
current_batch_size = batch_end - i
|
| 176 |
+
|
| 177 |
+
# 获取当前批次的 motion latent
|
| 178 |
+
batch_motion = motion_latent[i:batch_end]
|
| 179 |
+
|
| 180 |
+
# 扩展参考帧 latent 到批次大小
|
| 181 |
+
ref_latent_expanded = ref_latent.expand(current_batch_size, -1)
|
| 182 |
+
|
| 183 |
+
# 扩展 face_feat 到批次大小
|
| 184 |
+
face_feat_expanded = face_feat.expand(current_batch_size, -1, -1, -1, -1)
|
| 185 |
+
|
| 186 |
+
# 批量计算 flow
|
| 187 |
+
tgt = flow_estimator(ref_latent_expanded, batch_motion)
|
| 188 |
+
|
| 189 |
+
# 批量生成图像
|
| 190 |
+
recon = face_generator(tgt, face_feat_expanded)
|
| 191 |
+
|
| 192 |
+
# 转换并写入 - 直接在 GPU 上做归一化
|
| 193 |
+
# (batch, 3, 512, 512) -> (batch, 512, 512, 3)
|
| 194 |
+
recon = recon.float()
|
| 195 |
+
recon = (recon + 1) / 2 * 255
|
| 196 |
+
recon = recon.clamp(0, 255).to(torch.uint8)
|
| 197 |
+
recon = recon.permute(0, 2, 3, 1).contiguous()
|
| 198 |
+
|
| 199 |
+
# 分块传输到 CPU 并写入
|
| 200 |
+
frames_np = recon.cpu().numpy()
|
| 201 |
+
ffmpeg_process.stdin.write(frames_np.tobytes())
|
| 202 |
+
|
| 203 |
+
# 关闭 FFmpeg
|
| 204 |
+
ffmpeg_process.stdin.close()
|
| 205 |
+
ffmpeg_process.wait()
|
| 206 |
+
|
| 207 |
+
elapsed = time.time() - start_time
|
| 208 |
+
total_frames += num_frames
|
| 209 |
+
total_time += elapsed
|
| 210 |
+
fps = num_frames / elapsed
|
| 211 |
+
print(f" Rendered + encoded {num_frames} frames in {elapsed:.2f}s ({fps:.1f} fps)")
|
| 212 |
+
|
| 213 |
+
# 合并音频
|
| 214 |
+
if audio_path and os.path.exists(audio_path):
|
| 215 |
+
# 使用 FFmpeg 直接合并音频(比 moviepy 快很多)
|
| 216 |
+
final_with_audio = os.path.join(save_dir, f"{video_id}_with_audio.mp4")
|
| 217 |
+
ffmpeg_audio_cmd = [
|
| 218 |
+
'ffmpeg', '-y',
|
| 219 |
+
'-i', temp_mp4,
|
| 220 |
+
'-i', audio_path,
|
| 221 |
+
'-c:v', 'copy',
|
| 222 |
+
'-c:a', 'aac',
|
| 223 |
+
'-shortest',
|
| 224 |
+
final_with_audio
|
| 225 |
+
]
|
| 226 |
+
subprocess.run(ffmpeg_audio_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
| 227 |
+
os.remove(temp_mp4)
|
| 228 |
+
os.rename(final_with_audio, finalfinal_mp4)
|
| 229 |
+
else:
|
| 230 |
+
os.rename(temp_mp4, finalfinal_mp4)
|
| 231 |
+
|
| 232 |
+
except Exception as e:
|
| 233 |
+
import traceback
|
| 234 |
+
print(f"Error processing {npz_file}: {str(e)}")
|
| 235 |
+
traceback.print_exc()
|
| 236 |
+
continue
|
| 237 |
+
|
| 238 |
+
# 打印总体统计
|
| 239 |
+
if total_time > 0:
|
| 240 |
+
print(f"\n{'='*50}")
|
| 241 |
+
print(f"总计: {total_frames} 帧, {total_time:.2f} ��")
|
| 242 |
+
print(f"平均渲染速度: {total_frames / total_time:.1f} fps")
|
| 243 |
+
print(f"{'='*50}")
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
if __name__ == "__main__":
|
| 247 |
+
fire.Fire(latent_to_video_batch)
|
| 248 |
+
# Example usage:
|
| 249 |
+
# python latent_to_video_batch.py --npz_dir ./test_case/ --save_dir ./test_case/ --batch_size 32 --use_fp16 True
|
tools/visualization_0416/utils/__init__.py
ADDED
|
File without changes
|
tools/visualization_0416/utils/face_detector.py
ADDED
|
@@ -0,0 +1,624 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import mediapipe as mp
|
| 2 |
+
from mediapipe import solutions
|
| 3 |
+
from mediapipe.framework.formats import landmark_pb2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import cv2
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def convert_bbox_to_square_bbox(bbox, max_h, max_w, scale=1.0):
|
| 9 |
+
# Calculate width, height, and max_size of the bounding box
|
| 10 |
+
width = bbox[1][0] - bbox[0][0]
|
| 11 |
+
height = bbox[1][1] - bbox[0][1]
|
| 12 |
+
max_size = max(width, height) * scale
|
| 13 |
+
|
| 14 |
+
# Calculate center of the bounding box
|
| 15 |
+
center_x = (bbox[0][0] + bbox[1][0]) / 2
|
| 16 |
+
center_y = (bbox[0][1] + bbox[1][1]) / 2
|
| 17 |
+
|
| 18 |
+
# Calculate the left-up and right-bottom corners of the square bounding box
|
| 19 |
+
half_size = max_size / 2
|
| 20 |
+
left_top = [int(center_x - half_size), int(center_y - half_size)]
|
| 21 |
+
right_bottom = [int(center_x + half_size), int(center_y + half_size)]
|
| 22 |
+
|
| 23 |
+
# Ensure the square is within image bounds
|
| 24 |
+
left_top[0] = max(0, left_top[0])
|
| 25 |
+
left_top[1] = max(0, left_top[1])
|
| 26 |
+
right_bottom[0] = min(max_w, right_bottom[0])
|
| 27 |
+
right_bottom[1] = min(max_h, right_bottom[1])
|
| 28 |
+
|
| 29 |
+
# Return the new bounding box as a list of top-left and bottom-right coordinates
|
| 30 |
+
return [left_top[0], left_top[1], right_bottom[0], right_bottom[1]]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def draw_landmarks_on_image(rgb_image, detection_result):
|
| 34 |
+
face_landmarks_list = detection_result.face_landmarks
|
| 35 |
+
annotated_image = np.copy(rgb_image)
|
| 36 |
+
|
| 37 |
+
# Loop through the detected faces to visualize.
|
| 38 |
+
for idx in range(len(face_landmarks_list)):
|
| 39 |
+
face_landmarks = face_landmarks_list[idx]
|
| 40 |
+
|
| 41 |
+
# Draw the face landmarks.
|
| 42 |
+
face_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
|
| 43 |
+
face_landmarks_proto.landmark.extend(
|
| 44 |
+
[
|
| 45 |
+
landmark_pb2.NormalizedLandmark(
|
| 46 |
+
x=landmark.x, y=landmark.y, z=landmark.z
|
| 47 |
+
)
|
| 48 |
+
for landmark in face_landmarks
|
| 49 |
+
]
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
solutions.drawing_utils.draw_landmarks(
|
| 53 |
+
image=annotated_image,
|
| 54 |
+
landmark_list=face_landmarks_proto,
|
| 55 |
+
connections=mp.solutions.face_mesh.FACEMESH_TESSELATION,
|
| 56 |
+
landmark_drawing_spec=None,
|
| 57 |
+
connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_tesselation_style(),
|
| 58 |
+
)
|
| 59 |
+
solutions.drawing_utils.draw_landmarks(
|
| 60 |
+
image=annotated_image,
|
| 61 |
+
landmark_list=face_landmarks_proto,
|
| 62 |
+
connections=mp.solutions.face_mesh.FACEMESH_CONTOURS,
|
| 63 |
+
landmark_drawing_spec=None,
|
| 64 |
+
connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_contours_style(),
|
| 65 |
+
)
|
| 66 |
+
solutions.drawing_utils.draw_landmarks(
|
| 67 |
+
image=annotated_image,
|
| 68 |
+
landmark_list=face_landmarks_proto,
|
| 69 |
+
connections=mp.solutions.face_mesh.FACEMESH_IRISES,
|
| 70 |
+
landmark_drawing_spec=None,
|
| 71 |
+
connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_iris_connections_style(),
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
return annotated_image
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class FaceDetector:
|
| 78 |
+
def __init__(self, mediapipe_model_asset_path, delegate=1, face_detection_confidence=0.5, num_faces=5):
|
| 79 |
+
# Create a face landmarker instance with the video mode:
|
| 80 |
+
options = mp.tasks.vision.FaceLandmarkerOptions(
|
| 81 |
+
base_options=mp.tasks.BaseOptions(
|
| 82 |
+
model_asset_path=mediapipe_model_asset_path,
|
| 83 |
+
# delegate=mp.tasks.BaseOptions.Delegate.GPU,
|
| 84 |
+
# TODO: why does the gpu version not work in docker???
|
| 85 |
+
delegate=delegate,
|
| 86 |
+
),
|
| 87 |
+
running_mode=mp.tasks.vision.RunningMode.IMAGE,
|
| 88 |
+
num_faces=num_faces,
|
| 89 |
+
output_face_blendshapes=True,
|
| 90 |
+
output_facial_transformation_matrixes=True,
|
| 91 |
+
min_face_detection_confidence=face_detection_confidence,
|
| 92 |
+
min_face_presence_confidence=face_detection_confidence,
|
| 93 |
+
min_tracking_confidence=face_detection_confidence,
|
| 94 |
+
)
|
| 95 |
+
self.detector = mp.tasks.vision.FaceLandmarker.create_from_options(options)
|
| 96 |
+
|
| 97 |
+
def get_one_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
|
| 98 |
+
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
|
| 99 |
+
|
| 100 |
+
# get facial rotation
|
| 101 |
+
results = self.detector.detect(mp_image)
|
| 102 |
+
max_h, max_w = image.shape[:2]
|
| 103 |
+
|
| 104 |
+
if annotate_image:
|
| 105 |
+
annotated_image = draw_landmarks_on_image(image, results)
|
| 106 |
+
else:
|
| 107 |
+
annotated_image = None
|
| 108 |
+
|
| 109 |
+
all_x = []
|
| 110 |
+
all_y = []
|
| 111 |
+
all_orientation = []
|
| 112 |
+
all_keypoints = []
|
| 113 |
+
all_bounding_box = []
|
| 114 |
+
all_mouth_bounding_box = []
|
| 115 |
+
all_eye_bounding_box = []
|
| 116 |
+
all_face_contour = []
|
| 117 |
+
all_eyeball = []
|
| 118 |
+
all_eyeball_mask = []
|
| 119 |
+
all_blendshapes = []
|
| 120 |
+
all_mouth_p = []
|
| 121 |
+
all_nose_p = []
|
| 122 |
+
all_left_eye_p = []
|
| 123 |
+
all_right_eye_p = []
|
| 124 |
+
num_faces = len(results.face_landmarks)
|
| 125 |
+
|
| 126 |
+
for face_blendshapes in results.face_blendshapes:
|
| 127 |
+
blendshapes = [item.score for item in face_blendshapes]
|
| 128 |
+
all_blendshapes.append(blendshapes)
|
| 129 |
+
|
| 130 |
+
all_facial_transformation_matrices = results.facial_transformation_matrixes
|
| 131 |
+
|
| 132 |
+
for face_landmarks in results.face_landmarks:
|
| 133 |
+
keypoints = []
|
| 134 |
+
bounding_box = []
|
| 135 |
+
|
| 136 |
+
h, w = image.shape[0], image.shape[1]
|
| 137 |
+
cx_min, cy_min = w, h
|
| 138 |
+
cx_max, cy_max = 0, 0
|
| 139 |
+
for idx, lm in enumerate(face_landmarks):
|
| 140 |
+
# Clip landmarks if they go off the image
|
| 141 |
+
cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)
|
| 142 |
+
|
| 143 |
+
if cx < cx_min:
|
| 144 |
+
cx_min = cx
|
| 145 |
+
if cy < cy_min:
|
| 146 |
+
cy_min = cy
|
| 147 |
+
if cx > cx_max:
|
| 148 |
+
cx_max = cx
|
| 149 |
+
if cy > cy_max:
|
| 150 |
+
cy_max = cy
|
| 151 |
+
|
| 152 |
+
keypoints.append((lm.x, lm.y, lm.z))
|
| 153 |
+
|
| 154 |
+
if idx == 137:
|
| 155 |
+
right_cheek = (lm.x, lm.y, lm.z)
|
| 156 |
+
if idx == 366:
|
| 157 |
+
left_cheek = (lm.x, lm.y, lm.z)
|
| 158 |
+
if idx == 4:
|
| 159 |
+
nose = (lm.x, lm.y, lm.z)
|
| 160 |
+
|
| 161 |
+
# get vector from middle of face to tip of nose
|
| 162 |
+
face_middle = (
|
| 163 |
+
(right_cheek[0] + left_cheek[0]) / 2.0,
|
| 164 |
+
(right_cheek[1] + left_cheek[1]) / 2.0,
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
x = nose[0] - face_middle[0]
|
| 168 |
+
y = nose[1] - face_middle[1]
|
| 169 |
+
|
| 170 |
+
if x > 0.15:
|
| 171 |
+
orientation = "left"
|
| 172 |
+
elif x < -0.15:
|
| 173 |
+
orientation = "right"
|
| 174 |
+
else:
|
| 175 |
+
orientation = "forward"
|
| 176 |
+
|
| 177 |
+
bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]
|
| 178 |
+
|
| 179 |
+
all_keypoints.append(keypoints)
|
| 180 |
+
all_bounding_box.append(bounding_box)
|
| 181 |
+
all_x.append(x)
|
| 182 |
+
all_y.append(y)
|
| 183 |
+
all_orientation.append(orientation)
|
| 184 |
+
|
| 185 |
+
# Get mouth bounding box (landmarks 13-17 and 308-312)
|
| 186 |
+
mouth_landmarks = [
|
| 187 |
+
61,
|
| 188 |
+
146,
|
| 189 |
+
146,
|
| 190 |
+
91,
|
| 191 |
+
91,
|
| 192 |
+
181,
|
| 193 |
+
181,
|
| 194 |
+
84,
|
| 195 |
+
84,
|
| 196 |
+
17,
|
| 197 |
+
17,
|
| 198 |
+
314,
|
| 199 |
+
314,
|
| 200 |
+
405,
|
| 201 |
+
405,
|
| 202 |
+
321,
|
| 203 |
+
321,
|
| 204 |
+
375,
|
| 205 |
+
375,
|
| 206 |
+
291,
|
| 207 |
+
61,
|
| 208 |
+
185,
|
| 209 |
+
185,
|
| 210 |
+
40,
|
| 211 |
+
40,
|
| 212 |
+
39,
|
| 213 |
+
39,
|
| 214 |
+
37,
|
| 215 |
+
37,
|
| 216 |
+
0,
|
| 217 |
+
0,
|
| 218 |
+
267,
|
| 219 |
+
267,
|
| 220 |
+
269,
|
| 221 |
+
269,
|
| 222 |
+
270,
|
| 223 |
+
270,
|
| 224 |
+
409,
|
| 225 |
+
409,
|
| 226 |
+
291,
|
| 227 |
+
78,
|
| 228 |
+
95,
|
| 229 |
+
95,
|
| 230 |
+
88,
|
| 231 |
+
88,
|
| 232 |
+
178,
|
| 233 |
+
178,
|
| 234 |
+
87,
|
| 235 |
+
87,
|
| 236 |
+
14,
|
| 237 |
+
14,
|
| 238 |
+
317,
|
| 239 |
+
317,
|
| 240 |
+
402,
|
| 241 |
+
402,
|
| 242 |
+
318,
|
| 243 |
+
318,
|
| 244 |
+
324,
|
| 245 |
+
324,
|
| 246 |
+
308,
|
| 247 |
+
78,
|
| 248 |
+
191,
|
| 249 |
+
191,
|
| 250 |
+
80,
|
| 251 |
+
80,
|
| 252 |
+
81,
|
| 253 |
+
81,
|
| 254 |
+
82,
|
| 255 |
+
82,
|
| 256 |
+
13,
|
| 257 |
+
13,
|
| 258 |
+
312,
|
| 259 |
+
312,
|
| 260 |
+
311,
|
| 261 |
+
311,
|
| 262 |
+
310,
|
| 263 |
+
310,
|
| 264 |
+
415,
|
| 265 |
+
415,
|
| 266 |
+
308,
|
| 267 |
+
]
|
| 268 |
+
# mouth_landmarks = [13, 14, 15, 16, 17, 308, 309, 310, 311, 312]
|
| 269 |
+
mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
|
| 270 |
+
mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
|
| 271 |
+
mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
|
| 272 |
+
mouth_p = np.array([(mouth_bbox[0][0] + mouth_bbox[1][0]) / 2, (mouth_bbox[1][0] + mouth_bbox[1][1]) / 2])
|
| 273 |
+
mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)
|
| 274 |
+
|
| 275 |
+
nose_landmarks = [48, 115, 220, 45, 4, 275, 440, 344, 278]
|
| 276 |
+
nose_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in nose_landmarks]
|
| 277 |
+
nose_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in nose_landmarks]
|
| 278 |
+
nose_bbox = [(min(nose_x), min(nose_y)), (max(nose_x), max(nose_y))]
|
| 279 |
+
nose_p = np.array([(nose_bbox[0][0] + nose_bbox[1][0]) / 2, (nose_bbox[1][0] + nose_bbox[1][1]) / 2])
|
| 280 |
+
|
| 281 |
+
# width = mouth_bbox[1][0] - mouth_bbox[0][0]
|
| 282 |
+
# height = mouth_bbox[1][1] - mouth_bbox[0][1]
|
| 283 |
+
# max_size = max(width, height) * 1.2
|
| 284 |
+
# center_x = (mouth_bbox[0][0] + mouth_bbox[1][0]) / 2
|
| 285 |
+
# center_y = (mouth_bbox[0][1] + mouth_bbox[1][1]) / 2
|
| 286 |
+
# left_up = (int(center_x - max_size/2), int(center_y - max_size/2))
|
| 287 |
+
# right_bottom = (int(center_x + max_size/2), int(center_y + max_size/2))
|
| 288 |
+
# mouth_bbox = [left_up, right_bottom]
|
| 289 |
+
|
| 290 |
+
all_mouth_bounding_box.append(mouth_bbox)
|
| 291 |
+
|
| 292 |
+
# Get eye bounding boxes (left eye: landmarks 33-133, right eye: landmarks 362-263)
|
| 293 |
+
left_eye_landmarks = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
|
| 294 |
+
right_eye_landmarks = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]
|
| 295 |
+
|
| 296 |
+
left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
|
| 297 |
+
left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
|
| 298 |
+
left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
|
| 299 |
+
left_size = max(left_eye_y) - min(left_eye_y)
|
| 300 |
+
left_eye_p = np.array([(left_eye_bbox[0][0] + left_eye_bbox[1][0]) / 2, (left_eye_bbox[1][0] + left_eye_bbox[1][1]) / 2])
|
| 301 |
+
left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
|
| 302 |
+
|
| 303 |
+
right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
|
| 304 |
+
right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
|
| 305 |
+
right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
|
| 306 |
+
right_size = max(right_eye_y) - min(right_eye_y)
|
| 307 |
+
right_eye_p = np.array([(right_eye_bbox[0][0] + right_eye_bbox[1][0]) / 2, (right_eye_bbox[1][0] + right_eye_bbox[1][1]) / 2])
|
| 308 |
+
right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
|
| 309 |
+
|
| 310 |
+
eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}
|
| 311 |
+
|
| 312 |
+
all_eye_bounding_box.append(eye_bbox)
|
| 313 |
+
|
| 314 |
+
face_contour = np.zeros_like(image)
|
| 315 |
+
for landmark_id, landmark in enumerate(face_landmarks):
|
| 316 |
+
cx, cy = int(landmark.x * w), int(landmark.y * h)
|
| 317 |
+
if cy >= max_h or cx >= max_w: continue
|
| 318 |
+
if cy < 0 or cx < 0: continue
|
| 319 |
+
face_contour[cy, cx] = (255, 255, 255)
|
| 320 |
+
|
| 321 |
+
eyeball = np.zeros_like(image)
|
| 322 |
+
for landmark_id, landmark in enumerate(face_landmarks):
|
| 323 |
+
cx, cy = int(landmark.x * w), int(landmark.y * h)
|
| 324 |
+
if landmark_id not in [468, 473]: continue
|
| 325 |
+
if cy >= max_h or cx >= max_w: continue
|
| 326 |
+
if cy < 0 or cx < 0: continue
|
| 327 |
+
radius = int(left_size // 3) if landmark_id == 468 else int(right_size // 3)
|
| 328 |
+
cv2.circle(eyeball, (cx, cy), radius=radius, color=(255, 0, 0), thickness=-1)
|
| 329 |
+
eyeball_mask = (eyeball.sum(axis=2) != 0)[:, :, None]
|
| 330 |
+
|
| 331 |
+
all_eyeball.append(eyeball)
|
| 332 |
+
all_eyeball_mask.append(eyeball_mask)
|
| 333 |
+
all_face_contour.append(face_contour)
|
| 334 |
+
all_mouth_p.append(mouth_p)
|
| 335 |
+
all_nose_p.append(nose_p)
|
| 336 |
+
all_left_eye_p.append(left_eye_p)
|
| 337 |
+
all_right_eye_p.append(right_eye_p)
|
| 338 |
+
|
| 339 |
+
if save_vis:
|
| 340 |
+
x_min, y_min, x_max, y_max = mouth_bbox
|
| 341 |
+
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
|
| 342 |
+
|
| 343 |
+
for eye_key, bbox in eye_bbox.items():
|
| 344 |
+
x_min, y_min, x_max, y_max = bbox
|
| 345 |
+
color = (0, 0, 255)
|
| 346 |
+
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)
|
| 347 |
+
|
| 348 |
+
for landmark_id, landmark in enumerate(face_landmarks):
|
| 349 |
+
cx, cy = int(landmark.x * w), int(landmark.y * h)
|
| 350 |
+
circle_size = 2
|
| 351 |
+
if landmark_id in mouth_landmarks:
|
| 352 |
+
cv2.circle(image, (cx, cy), circle_size, (0, 0, 255), -1)
|
| 353 |
+
elif landmark_id in left_eye_landmarks+right_eye_landmarks:
|
| 354 |
+
cv2.circle(image, (cx, cy), circle_size, (0, 255, 0), -1)
|
| 355 |
+
else:
|
| 356 |
+
cv2.circle(image, (cx, cy), circle_size, (255, 255, 255), -1)
|
| 357 |
+
cv2.imwrite('image_detect.png', image[:,:,::-1])
|
| 358 |
+
# import pdb; pdb.set_trace()
|
| 359 |
+
|
| 360 |
+
return (
|
| 361 |
+
all_x,
|
| 362 |
+
all_y,
|
| 363 |
+
all_orientation,
|
| 364 |
+
num_faces,
|
| 365 |
+
all_keypoints,
|
| 366 |
+
all_bounding_box,
|
| 367 |
+
all_mouth_bounding_box,
|
| 368 |
+
all_eye_bounding_box,
|
| 369 |
+
all_face_contour,
|
| 370 |
+
all_blendshapes,
|
| 371 |
+
all_facial_transformation_matrices,
|
| 372 |
+
annotated_image,
|
| 373 |
+
all_mouth_p, # 12
|
| 374 |
+
all_nose_p, # 13
|
| 375 |
+
all_left_eye_p, # 14
|
| 376 |
+
all_right_eye_p, # 15
|
| 377 |
+
all_eyeball, # 16
|
| 378 |
+
all_eyeball_mask, # 17
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
def get_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
    """Run MediaPipe face landmarking on an RGB image and extract per-face geometry.

    For every detected face this collects: normalized (x, y, z) keypoints, a pixel
    bounding box over all landmarks, square mouth/eye bounding boxes, a sparse
    face-contour image, blendshape scores, and a coarse head-orientation estimate
    derived from the nose-to-cheek-midpoint offset.

    Args:
        image: RGB image as a numpy array of shape (H, W, 3), uint8
            (passed directly to mp.Image with SRGB format).
        mouth_bbox_scale: scale factor applied when squaring the mouth bbox
            via convert_bbox_to_square_bbox.
        eye_bbox_scale: scale factor applied when squaring each eye bbox.
        annotate_image: if True, also render landmarks onto a copy of the image
            via draw_landmarks_on_image; otherwise annotated_image is None.
        save_vis: if True, draw debug rectangles/landmark dots on `image`
            (mutates it in place) and write 'image_detect.png' to the cwd.

    Returns:
        A 12-tuple:
            (all_x, all_y, all_orientation, num_faces, all_keypoints,
             all_bounding_box, all_mouth_bounding_box, all_eye_bounding_box,
             all_face_contour, all_blendshapes,
             all_facial_transformation_matrices, annotated_image)
        where each `all_*` list has one entry per detected face.
    """
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    # get facial rotation
    results = self.detector.detect(mp_image)
    max_h, max_w = image.shape[:2]

    if annotate_image:
        annotated_image = draw_landmarks_on_image(image, results)
    else:
        annotated_image = None

    # Per-face accumulators; index i of every list refers to face i.
    all_x = []
    all_y = []
    all_orientation = []
    all_keypoints = []
    all_bounding_box = []
    all_mouth_bounding_box = []
    all_eye_bounding_box = []
    all_face_contour = []
    all_blendshapes = []
    num_faces = len(results.face_landmarks)

    # Flatten each face's blendshape categories into a plain list of scores.
    for face_blendshapes in results.face_blendshapes:
        blendshapes = [item.score for item in face_blendshapes]
        all_blendshapes.append(blendshapes)

    # NOTE: "matrixes" spelling is the MediaPipe API attribute name, not a typo here.
    all_facial_transformation_matrices = results.facial_transformation_matrixes

    for face_landmarks in results.face_landmarks:
        keypoints = []
        bounding_box = []

        h, w = image.shape[0], image.shape[1]
        # Running min/max in pixel coordinates for the whole-face bbox.
        cx_min, cy_min = w, h
        cx_max, cy_max = 0, 0
        for idx, lm in enumerate(face_landmarks):
            # Clip landmarks if they go off the image
            cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)

            if cx < cx_min:
                cx_min = cx
            if cy < cy_min:
                cy_min = cy
            if cx > cx_max:
                cx_max = cx
            if cy > cy_max:
                cy_max = cy

            # Keep the raw normalized coordinates (unclipped) as keypoints.
            keypoints.append((lm.x, lm.y, lm.z))

            # MediaPipe face-mesh indices: 137/366 are cheek points, 4 is the nose tip.
            if idx == 137:
                right_cheek = (lm.x, lm.y, lm.z)
            if idx == 366:
                left_cheek = (lm.x, lm.y, lm.z)
            if idx == 4:
                nose = (lm.x, lm.y, lm.z)

        # get vector from middle of face to tip of nose
        face_middle = (
            (right_cheek[0] + left_cheek[0]) / 2.0,
            (right_cheek[1] + left_cheek[1]) / 2.0,
        )

        x = nose[0] - face_middle[0]
        y = nose[1] - face_middle[1]

        # Threshold the horizontal nose offset (normalized units) into a
        # coarse yaw label.
        if x > 0.15:
            orientation = "left"
        elif x < -0.15:
            orientation = "right"
        else:
            orientation = "forward"

        bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]

        all_keypoints.append(keypoints)
        all_bounding_box.append(bounding_box)
        all_x.append(x)
        all_y.append(y)
        all_orientation.append(orientation)

        # Get mouth bounding box (landmarks 13-17 and 308-312)
        # NOTE(review): entries come in repeated pairs — this looks like the
        # FACEMESH_LIPS connection list flattened into endpoints; duplicates are
        # harmless for a min/max bbox. TODO confirm origin.
        mouth_landmarks = [
            61, 146, 146, 91, 91, 181, 181, 84, 84, 17,
            17, 314, 314, 405, 405, 321, 321, 375, 375, 291,
            61, 185, 185, 40, 40, 39, 39, 37, 37, 0,
            0, 267, 267, 269, 269, 270, 270, 409, 409, 291,
            78, 95, 95, 88, 88, 178, 178, 87, 87, 14,
            14, 317, 317, 402, 402, 318, 318, 324, 324, 308,
            78, 191, 191, 80, 80, 81, 81, 82, 82, 13,
            13, 312, 312, 311, 311, 310, 310, 415, 415, 308,
        ]
        # mouth_landmarks = [13, 14, 15, 16, 17, 308, 309, 310, 311, 312]
        mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
        mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
        mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
        mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)

        # width = mouth_bbox[1][0] - mouth_bbox[0][0]
        # height = mouth_bbox[1][1] - mouth_bbox[0][1]
        # max_size = max(width, height) * 1.2
        # center_x = (mouth_bbox[0][0] + mouth_bbox[1][0]) / 2
        # center_y = (mouth_bbox[0][1] + mouth_bbox[1][1]) / 2
        # left_up = (int(center_x - max_size/2), int(center_y - max_size/2))
        # right_bottom = (int(center_x + max_size/2), int(center_y + max_size/2))
        # mouth_bbox = [left_up, right_bottom]

        all_mouth_bounding_box.append(mouth_bbox)

        # Get eye bounding boxes (left eye: landmarks 33-133, right eye: landmarks 362-263)
        left_eye_landmarks = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
        right_eye_landmarks = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]

        left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
        left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
        left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
        left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

        right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
        right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
        right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
        right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

        eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}

        all_eye_bounding_box.append(eye_bbox)

        # Sparse contour image: one white pixel per landmark, same shape as `image`.
        face_contour = np.zeros_like(image)
        for landmark_id, landmark in enumerate(face_landmarks):
            cx, cy = int(landmark.x * w), int(landmark.y * h)
            # Landmarks here are NOT clipped, so skip any that fall outside the frame.
            if cy >= max_h or cx >= max_w: continue
            if cy < 0 or cx < 0: continue
            face_contour[cy, cx] = (255, 255, 255)
        all_face_contour.append(face_contour)

        if save_vis:
            # Debug-only visualization path; cv2 imported lazily so the normal
            # path has no OpenCV dependency. Mutates `image` in place.
            import cv2
            # NOTE(review): this unpacks mouth_bbox into four scalars — assumes
            # convert_bbox_to_square_bbox returns a flat (x_min, y_min, x_max, y_max);
            # the pre-conversion format was [(x1, y1), (x2, y2)]. TODO confirm.
            x_min, y_min, x_max, y_max = mouth_bbox
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

            for eye_key, bbox in eye_bbox.items():
                x_min, y_min, x_max, y_max = bbox
                color = (0, 0, 255)
                cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)

            for landmark_id, landmark in enumerate(face_landmarks):
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                circle_size = 2
                # Color code: mouth red, eyes green, everything else white.
                if landmark_id in mouth_landmarks:
                    cv2.circle(image, (cx, cy), circle_size, (0, 0, 255), -1)
                elif landmark_id in left_eye_landmarks+right_eye_landmarks:
                    cv2.circle(image, (cx, cy), circle_size, (0, 255, 0), -1)
                else:
                    cv2.circle(image, (cx, cy), circle_size, (255, 255, 255), -1)
            # Channel reversal RGB->BGR for cv2.imwrite.
            cv2.imwrite('image_detect.png', image[:,:,::-1])
            # import pdb; pdb.set_trace()

    return (
        all_x,
        all_y,
        all_orientation,
        num_faces,
        all_keypoints,
        all_bounding_box,
        all_mouth_bounding_box,
        all_eye_bounding_box,
        all_face_contour,
        all_blendshapes,
        all_facial_transformation_matrices,
        annotated_image,
    )
|
tools/visualization_0416/utils/face_landmarker.task
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
|
| 3 |
+
size 3758596
|