robinwitch commited on
Commit
872b1a7
·
1 Parent(s): 1631026

upload ckpt

Browse files
checkpoints/last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15baf05196834ca54b4da7c2c9fd372b572e650b1edd84cb5a92c4be1689f29b
3
+ size 7730126719
tools/pretrained_model/epoch=0-step=312000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9f06f55912dfa12ea18d77315f1c1675c5f67669097db8f155b4a4d75f9d31d
3
+ size 1579672001
tools/visualization_0416/configs/audio_head_animator.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This config file is mainly used by img_to_mask.py to obtain face-detection related parameters
2
+ debug: false
3
+ seed: 39
4
+ root_name: audio_head_animator
5
+ exp_name: ${root_name}/inference
6
+ mode: train
7
+ n_epochs: null
8
+ cache_dir: cache
9
+ ckpt_dir: ${exp_name}/ckpt
10
+ resume_ckpt: null
11
+
12
+ only_resume_state_dict: False
13
+ pretrained_ckpt: null
14
+
15
+ model:
16
+ module_name: model.head_animation.head_animator
17
+ class_name: HeadAnimatorModule
18
+ pretrained_ckpt: ${pretrained_ckpt}
19
+ using_hybrid_mask: True
20
+ output_dir: ${exp_name}
21
+
22
+ face_encoder:
23
+ module_name: model.head_animation.LIA_3d.face_encoder
24
+ class_name: FaceEncoder
25
+ image_size: 512
26
+ image_channel: 3
27
+ block_expansion: 64
28
+ num_down_blocks: 3
29
+ max_features: 512
30
+ reshape_channel: 32
31
+ reshape_depth: 16
32
+ num_resblocks: 6
33
+
34
+ motion_encoder:
35
+ module_name: model.head_animation.LIA_3d.motion_encoder
36
+ class_name: MotionEncoder
37
+ latent_dim: 512
38
+ size: ${model.face_encoder.image_size}
39
+
40
+ flow_estimator:
41
+ module_name: model.head_animation.LIA_3d.flow_estimator
42
+ class_name: FlowEstimator
43
+ latent_dim: ${model.motion_encoder.latent_dim}
44
+ motion_space: 64
45
+
46
+ face_generator:
47
+ module_name: model.head_animation.LIA_3d.face_generator
48
+ class_name: FaceGenerator
49
+ size: ${model.face_encoder.image_size}
50
+ latent_dim: ${model.motion_encoder.latent_dim}
51
+ outputsize: ${data.train_width}
52
+ reshape_channel: ${model.face_encoder.reshape_channel}
53
+ group_norm_channel: 32
54
+ flag_estimate_occlusion_map: True
55
+
56
+ discriminator:
57
+ module_name: model.head_animation.LIA.discriminator
58
+ class_name: Discriminator
59
+ size: ${data.train_width}
60
+
61
+ vgg_loss:
62
+ module_name: model.head_animation.VASA1.loss
63
+ class_name: VGGLoss
64
+
65
+ loss:
66
+ l_w_recon: 1
67
+ l_w_face_l1: 0
68
+ l_w_vgg: 2
69
+ l_w_gan: 0.2
70
+ l_w_face: 0
71
+ l_w_headpose: 0
72
+ l_w_gaze: 0
73
+ l_w_foreground: 0
74
+ l_w_local: 0
75
+
76
+ optimizer:
77
+ lr: 0.0001
78
+ discriminator_lr: 0.002
79
+ warmup_steps: 0
80
+ adam_beta1: 0.9
81
+ adam_beta2: 0.999
82
+ adam_epsilon: 1.0e-08
83
+ weight_decay: 0.0
84
+ g_reg_every: 4
85
+ d_reg_every: 16
86
+
87
+ logger:
88
+ neptune_project: null
89
+ neptune_api_token: null
90
+ wandb:
91
+ enabled: false
92
+ entity: null
93
+ project: "real-time"
94
+
95
+ callbacks:
96
+ - module_name: lightning.pytorch.callbacks
97
+ class_name: ModelCheckpoint
98
+ dirpath: ${ckpt_dir}
99
+ every_n_train_steps: 2000
100
+ save_top_k: -1
101
+
102
+ trainer:
103
+ accelerator: gpu
104
+ log_every_n_steps: 1
105
+ val_check_interval: 100000
106
+
107
+ data:
108
+ debug: False
109
+ train_bs: 12
110
+ accumulate_grad_batches: 1
111
+ n_sample_frames: 1
112
+ past_n: 1
113
+ num_workers: 8
114
+ ref_sample_margin: 10
115
+ train_width: 512
116
+ train_height: 512
117
+ union_bbox_scale: [1.2, 1.4]
118
+ mouth_bbox_scale: 1.5
119
+ eye_bbox_scale: 2.0
120
+ hybrid_face_mask: ${model.using_hybrid_mask}
121
+ flip_aug: True
122
+ filter_hand_videos: true
123
+ random_sample: False
124
+ dataset_file_path: []
125
+ cache_file_path: []
126
+ train_fps: 25
127
+ dataloader: FastVideoDatasetV2
128
+
129
+ val_data:
130
+ train_bs: 1
131
+ n_sample_frames: 40
132
+ past_n: 2
133
+ num_workers: 6
134
+ ref_sample_margin: ${data.ref_sample_margin}
135
+ train_width: ${data.train_width}
136
+ train_height: ${data.train_height}
137
+ union_bbox_scale: [1.2, 1.4]
138
+ mouth_bbox_scale: ${data.mouth_bbox_scale}
139
+ eye_bbox_scale: ${data.eye_bbox_scale}
140
+ hybrid_face_mask: ${data.hybrid_face_mask}
141
+ flip_aug: False
142
+ filter_hand_videos: ${data.filter_hand_videos}
143
+ random_sample: False
144
+ dataset_file_path: []
145
+ train_fps: ${data.train_fps}
146
+ dataloader: ${data.dataloader}
147
+
148
+ test_data:
149
+ height: 384
150
+ width: 672
151
+ image_paths_and_scales: []
152
+
153
+ inference:
154
+ output_dir: inference_outputs/${exp_name}
tools/visualization_0416/configs/head_animator_best_0506.yaml ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ debug: false
2
+ seed: 39
3
+ root_name: head_animator_LIA3D
4
+ exp_name: ${root_name}/inference
5
+ mode: train
6
+ n_epochs: null
7
+ cache_dir: cache
8
+ ckpt_dir: ${exp_name}/ckpt
9
+ resume_ckpt: ../pretrained_model/epoch=0-step=312000.ckpt
10
+
11
+ only_resume_state_dict: False
12
+ pretrained_ckpt: null
13
+
14
+ model:
15
+ module_name: model.head_animation.head_animator
16
+ class_name: HeadAnimatorModule
17
+ pretrained_ckpt: ${pretrained_ckpt}
18
+ using_hybrid_mask: True
19
+ output_dir: ${exp_name}
20
+
21
+ face_encoder:
22
+ module_name: model.head_animation.LIA_3d.face_encoder
23
+ class_name: FaceEncoder
24
+ image_size: 512
25
+ image_channel: 3
26
+ block_expansion: 64
27
+ num_down_blocks: 3
28
+ max_features: 512
29
+ reshape_channel: 32
30
+ reshape_depth: 16
31
+ num_resblocks: 6
32
+
33
+ motion_encoder:
34
+ module_name: model.head_animation.LIA_3d.motion_encoder
35
+ class_name: MotionEncoder
36
+ latent_dim: 512
37
+ size: ${model.face_encoder.image_size}
38
+
39
+ flow_estimator:
40
+ module_name: model.head_animation.LIA_3d.flow_estimator
41
+ class_name: FlowEstimator
42
+ latent_dim: ${model.motion_encoder.latent_dim}
43
+ motion_space: 64
44
+
45
+ face_generator:
46
+ module_name: model.head_animation.LIA_3d.face_generator
47
+ class_name: FaceGenerator
48
+ size: ${model.face_encoder.image_size}
49
+ latent_dim: ${model.motion_encoder.latent_dim}
50
+ outputsize: ${data.train_width}
51
+ reshape_channel: ${model.face_encoder.reshape_channel}
52
+ group_norm_channel: 32
53
+ flag_estimate_occlusion_map: True
54
+
55
+ discriminator:
56
+ module_name: model.head_animation.LIA.discriminator
57
+ class_name: Discriminator
58
+ size: ${data.train_width}
59
+
60
+ vgg_loss:
61
+ module_name: model.head_animation.VASA1.loss
62
+ class_name: VGGLoss
63
+
64
+ loss:
65
+ l_w_recon: 1
66
+ l_w_face_l1: 0
67
+ l_w_vgg: 2
68
+ l_w_gan: 0.2
69
+ l_w_face: 0
70
+ l_w_headpose: 0
71
+ l_w_gaze: 0
72
+ l_w_foreground: 0
73
+ l_w_local: 0
74
+
75
+ optimizer:
76
+ lr: 0.0001
77
+ discriminator_lr: 0.002
78
+ warmup_steps: 0
79
+ adam_beta1: 0.9
80
+ adam_beta2: 0.999
81
+ adam_epsilon: 1.0e-08
82
+ weight_decay: 0.0
83
+ g_reg_every: 4
84
+ d_reg_every: 16
85
+
86
+ logger:
87
+ neptune_project: null
88
+ neptune_api_token: null
89
+ wandb:
90
+ enabled: false
91
+ entity: null
92
+ project: "real-time"
93
+
94
+ callbacks:
95
+ - module_name: lightning.pytorch.callbacks
96
+ class_name: ModelCheckpoint
97
+ dirpath: ${ckpt_dir}
98
+ every_n_train_steps: 2000
99
+ save_top_k: -1
100
+
101
+ trainer:
102
+ accelerator: gpu
103
+ log_every_n_steps: 1
104
+ val_check_interval: 100000
105
+
106
+ data:
107
+ debug: False
108
+ train_bs: 12
109
+ accumulate_grad_batches: 1
110
+ n_sample_frames: 1
111
+ past_n: 1
112
+ num_workers: 8
113
+ ref_sample_margin: 10
114
+ train_width: 512
115
+ train_height: 512
116
+ union_bbox_scale: [1.2, 1.4]
117
+ mouth_bbox_scale: 1.5
118
+ eye_bbox_scale: 2.0
119
+ hybrid_face_mask: ${model.using_hybrid_mask}
120
+ flip_aug: True
121
+ filter_hand_videos: true
122
+ random_sample: False
123
+ dataset_file_path: []
124
+ cache_file_path: []
125
+ train_fps: 25
126
+ dataloader: FastVideoDatasetV2
127
+
128
+ val_data:
129
+ train_bs: 1
130
+ n_sample_frames: 40
131
+ past_n: 2
132
+ num_workers: 6
133
+ ref_sample_margin: ${data.ref_sample_margin}
134
+ train_width: ${data.train_width}
135
+ train_height: ${data.train_height}
136
+ union_bbox_scale: [1.2, 1.4]
137
+ mouth_bbox_scale: ${data.mouth_bbox_scale}
138
+ eye_bbox_scale: ${data.eye_bbox_scale}
139
+ hybrid_face_mask: ${data.hybrid_face_mask}
140
+ flip_aug: False
141
+ filter_hand_videos: ${data.filter_hand_videos}
142
+ random_sample: False
143
+ dataset_file_path: []
144
+ train_fps: ${data.train_fps}
145
+ dataloader: ${data.dataloader}
146
+
147
+ test_data:
148
+ height: 384
149
+ width: 672
150
+ image_paths_and_scales: []
151
+
152
+ inference:
153
+ output_dir: inference_outputs/${exp_name}
tools/visualization_0416/img_to_latent.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import os

# Resolve the project root and prepend it to sys.path so that the project's
# own `utils` package wins over any same-named installed module.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

import numpy as np
import torch
from PIL import Image
import torchvision.transforms as T
from omegaconf import OmegaConf
import fire
16
+
17
def init_fn(config_path):
    """Load the head-animator model described by *config_path*.

    Returns a dict holding the 512x512 normalizing image ``transform`` and
    the checkpoint's ``motion_encoder`` sub-module (in eval mode).
    """
    from utils import instantiate

    preprocess = T.Compose([T.Resize((512, 512)), T.ToTensor(), T.Normalize([0.5], [0.5])])
    cfg = OmegaConf.load(config_path)
    model_cls = instantiate(cfg.model, instantiate_module=False)
    net = model_cls(config=cfg)
    state = torch.load(cfg.resume_ckpt, map_location="cpu")
    # strict=False: the checkpoint may carry keys the module does not define.
    net.load_state_dict(state["state_dict"], strict=False)
    net.eval()
    return {"transform": preprocess, "motion_encoder": net.motion_encoder}
28
+
29
def extract_motion_latent(
        mask_image_path='./test_case/test_img_masked.png',
        config_path='./configs/head_animator_best_0506.yaml',
        save_npz_path='./test_case/test_img_resize.npz',
        version="0506"):
    """Encode a masked face image into a motion latent and merge it into an .npz file.

    Args:
        mask_image_path: Path to the masked (hybrid-mask) face image.
        config_path: Model config path; "0506" in its *file name* is swapped
            for *version*.
        save_npz_path: Output .npz path; keys already present in the file
            are preserved and merged with the new ones.
        version: Model version tag, selects the utils/model_<version> package.
    """
    # Make the version-specific model package importable before init_fn runs.
    sys.path.insert(0, f'./utils/model_{version}')
    # Swap the version tag only in the file name so a "0506" elsewhere in the
    # directory path is left untouched (same approach as latent_to_video.py).
    cfg_dir, cfg_name = os.path.split(config_path)
    config_path = os.path.join(cfg_dir, cfg_name.replace("0506", version))
    context = init_fn(config_path)
    transform = context["transform"]
    motion_encoder = context["motion_encoder"]

    img = Image.open(mask_image_path).convert("RGB")
    img_tensor = transform(img).unsqueeze(0)
    with torch.no_grad():
        latent = motion_encoder(img_tensor)[0]  # [1, 512]
    latent_np = latent.numpy()

    # If the output file already exists, merge with its current contents so
    # previously stored keys survive the re-save.
    if os.path.exists(save_npz_path):
        existing_data = np.load(save_npz_path, allow_pickle=True)
        data_dict = dict(existing_data)
        existing_data.close()
    else:
        data_dict = {}

    # splitext is extension-safe: the old code sliced off the last 4 chars
    # for video_id and str.replace'd "npz" anywhere in the path for the
    # reference-image path, both of which break on unusual paths.
    base, _ = os.path.splitext(save_npz_path)
    data_dict.update({
        'video_id': os.path.basename(base),
        'mask_img_path': mask_image_path,
        'ref_img_path': base + '.png',
        'motion_latent': latent_np
    })

    # Persist the merged dictionary.
    np.savez(save_npz_path, **data_dict)

if __name__ == '__main__':
    fire.Fire(extract_motion_latent)
tools/visualization_0416/img_to_mask.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ input: image_path
3
+ output: save a masked image and resized image
4
+ """
5
+ import os
6
+ import sys
7
+ import urllib.request
8
+ import numpy as np
9
+ import torch
10
+ import cv2
11
+ from PIL import Image
12
+ from omegaconf import OmegaConf
13
+ from torchvision import transforms
14
+ from utils.face_detector import FaceDetector
15
+ from pathlib import Path
16
+
17
def generate_crop_bounding_box(h, w, center, size=512):
    """Compute a crop box of up to *size* x *size* pixels around *center*.

    The box is clamped to the image bounds, so near an edge it may end up
    smaller than *size* in either dimension.

    :param h: Image height in pixels.
    :param w: Image width in pixels.
    :param center: Crop center as (y, x).
    :param size: Desired side length of the square crop (default is 512).
    :return: Clamped crop box as [x1, y1, x2, y2].
    """
    half_size = size // 2  # Half the size for the cropping region

    # Clamp each edge of the centered square to the image boundary.
    y1 = max(center[0] - half_size, 0)  # Ensure the y1 index is not less than 0
    x1 = max(center[1] - half_size, 0)  # Ensure the x1 index is not less than 0
    y2 = min(center[0] + half_size, h)  # Ensure the y2 index does not exceed the image height
    x2 = min(center[1] + half_size, w)  # Ensure the x2 index does not exceed the image width
    return [x1, y1, x2, y2]
35
+
36
def crop_from_bbox(image, center, bbox, size=512):
    """Copy the *bbox* region of *image* into a zero-padded *size* x *size* canvas.

    The bbox content is placed at the offset it would occupy inside a
    *size* x *size* window centered on *center*, so parts of the window that
    fall outside the image stay zero.

    :param image: The input image in NumPy array form, shape (H, W, C).
    :param center: Window center as (y, x); must be the same center used to
        build *bbox* (see ``generate_crop_bounding_box``).
    :param bbox: Clamped crop box as [x1, y1, x2, y2].
    :param size: The size of the output canvas (default is 512).
    :return: The cropped region with padding, shape (size, size, C).
    """
    x1, y1, x2, y2 = bbox
    half_size = size // 2  # Half the size for the cropping region
    # Create a zero-filled array for padding
    cropped = np.zeros((size, size, image.shape[2]), dtype=image.dtype)

    # Offset of the (clamped) bbox inside the centered window; copy the valid
    # region from the original image into place.
    top = center[0] - half_size
    left = center[1] - half_size
    cropped[(y1 - top):(y2 - top), (x1 - left):(x2 - left)] = image[y1:y2, x1:x2]

    return cropped
57
+
58
# Lazily-constructed module-level face detector (built by initialize_face_detector).
face_detector = None
model_path = "./utils/face_landmarker.task"
# Fetch the MediaPipe face-landmarker model if it is not cached locally.
# NOTE(review): this download runs at import time and has no checksum
# verification — confirm that is acceptable for the deployment environment.
if not os.path.exists(model_path):
    print("Downloading face landmarker model...")
    url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
    urllib.request.urlretrieve(url, model_path)

def initialize_face_detector():
    """Create the shared FaceDetector singleton if it does not exist yet."""
    global face_detector
    if face_detector is None:
        face_detector = FaceDetector(
            mediapipe_model_asset_path=model_path,
            face_detection_confidence=0.5,
            num_faces=1,
        )
# Eagerly build the detector at import time so callers can use it directly.
initialize_face_detector()
74
+
75
def augmentation(images, transform, state=None):
    """Apply *transform* to a single image or a list of images, as tensors.

    If *state* is given, the torch RNG state is restored first so any random
    transforms replay identically across calls.
    """
    if state is not None:
        torch.set_rng_state(state)
    if not isinstance(images, list):
        return transform(transforms.functional.to_tensor(images))
    # Stack the list into a single (N, C, H, W) batch before transforming.
    batch = torch.stack(
        [transforms.functional.to_tensor(im) for im in images], dim=0)
    return transform(batch)
82
+
83
def scale_bbox(bbox, h, w, scale=1.8):
    """Scale *bbox* about its own center by *scale*, clipped to the image.

    :param bbox: Box as [x0, y0, x1, y1].
    :param h: Image height, used as the vertical clip limit.
    :param w: Image width, used as the horizontal clip limit.
    :param scale: Multiplier applied to both half-extents (default 1.8).
    :return: Scaled, clipped box as [x0, y0, x1, y1].
    """
    x0, y0, x1, y1 = bbox
    half_w = (x1 - x0) / 2 * scale
    half_h = (y1 - y0) / 2 * scale
    cx = (x0 + x1) / 2
    cy = (y0 + y1) / 2
    scaled = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    # Clip x-coordinates to [0, w] and y-coordinates to [0, h].
    scaled[0] = np.clip(scaled[0], 0, w)
    scaled[2] = np.clip(scaled[2], 0, w)
    scaled[1] = np.clip(scaled[1], 0, h)
    scaled[3] = np.clip(scaled[3], 0, h)
    return scaled

def get_mask(bbox, hd, wd, scale=1.0, return_pil=True):
    """Build a binary (0/255) rectangular mask of shape (hd, wd, 3) from *bbox*.

    Raises when any bbox coordinate is negative (invalid detection).
    """
    if min(bbox) < 0:
        raise Exception("Invalid mask")
    rect = scale_bbox(bbox, hd, wd, scale=scale)
    x0, y0, x1, y1 = (int(v) for v in rect)
    mask = np.zeros((hd, wd, 3), dtype=np.uint8)
    mask[y0:y1, x0:x1, :] = 255
    return Image.fromarray(mask) if return_pil else mask
107
+
108
def generate_masked_image(
        image_path="./test_case/test_img.png",
        save_path="./test_case/test_img.png",
        crop=False,
        union_bbox_scale=1.3):
    """Detect a face, build eye/mouth masks, and save a masked + resized image pair.

    Writes ``<save_path base>_resize.png`` (the 512x512 reference image) and
    ``<save_path base>_masked.png`` (eye/mouth pixels kept, everything else
    replaced by the face-contour drawing).

    :param image_path: Input image path.
    :param save_path: Base path for the two output PNGs (extension dropped).
    :param crop: If True, first crop a square region around the detected face.
    :param union_bbox_scale: Scale factor for the face crop when ``crop`` is True.
    """
    # Config supplies the mouth/eye bbox scales used by the detector.
    cfg = OmegaConf.load("./configs/audio_head_animator.yaml")
    pixel_transform = transforms.Compose([
        transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.Normalize([0.5], [0.5]),
    ])
    resize_transform = transforms.Resize((512, 512), interpolation=transforms.InterpolationMode.BICUBIC)

    img = Image.open(image_path).convert("RGB")
    state = torch.get_rng_state()

    # Get face detection results first
    det_res = face_detector.get_face_xy_rotation_and_keypoints(
        np.array(img), cfg.data.mouth_bbox_scale, cfg.data.eye_bbox_scale
    )

    # NOTE(review): det_res appears to be indexable with 5=face bboxes,
    # 6=mouth bboxes, 7=eye bbox dicts, 8=face-contour drawings — confirm
    # against FaceDetector.get_face_xy_rotation_and_keypoints.
    person_id = 0
    mouth_bbox = np.array(det_res[6][person_id])
    eye_bbox = det_res[7][person_id]
    face_contour = np.array(det_res[8][person_id])
    left_eye_bbox = eye_bbox["left_eye"]
    right_eye_bbox = eye_bbox["right_eye"]

    # If crop is True, crop the face region first
    if crop:
        # Get the face bounding box and calculate center
        face_bbox = det_res[5][person_id]  # Get the face bounding box from det_res[5]
        # face_bbox is [(x1, y1), (x2, y2)]
        x1, y1 = face_bbox[0]
        x2, y2 = face_bbox[1]
        center = [(y1 + y2) // 2, (x1 + x2) // 2]

        # Calculate the size for cropping
        width = x2 - x1
        height = y2 - y1
        max_size = int(max(width, height) * union_bbox_scale)

        # Get the image dimensions
        hd, wd = img.size[1], img.size[0]

        # Generate the crop bounding box
        crop_bbox = generate_crop_bounding_box(hd, wd, center, max_size)

        # Crop the image
        img_array = np.array(img)
        cropped_img = crop_from_bbox(img_array, center, crop_bbox, size=max_size)
        img = Image.fromarray(cropped_img)

        # Update the face detection results for the cropped image
        det_res = face_detector.get_face_xy_rotation_and_keypoints(
            cropped_img, cfg.data.mouth_bbox_scale, cfg.data.eye_bbox_scale
        )
        mouth_bbox = np.array(det_res[6][person_id])
        eye_bbox = det_res[7][person_id]
        face_contour = np.array(det_res[8][person_id])
        left_eye_bbox = eye_bbox["left_eye"]
        right_eye_bbox = eye_bbox["right_eye"]

    # Normalize to [-1, 1] with the (possibly replayed) RNG state, then map
    # back to [0, 1] for mask compositing below.
    pixel_values_ref = augmentation([img], pixel_transform, state)
    pixel_values_ref = (pixel_values_ref + 1) / 2
    new_hd, new_wd = img.size[1], img.size[0]

    mouth_mask = resize_transform(get_mask(mouth_bbox, new_hd, new_wd, scale=1.0))
    left_eye_mask = resize_transform(get_mask(left_eye_bbox, new_hd, new_wd, scale=1.0))
    right_eye_mask = resize_transform(get_mask(right_eye_bbox, new_hd, new_wd, scale=1.0))
    face_contour = resize_transform(Image.fromarray(face_contour))

    # Union of the eye and mouth rectangles (255 where real pixels are kept).
    eye_mask = np.bitwise_or(np.array(left_eye_mask), np.array(right_eye_mask))
    combined_mask = np.bitwise_or(eye_mask, np.array(mouth_mask))

    combined_mask_tensor = torch.from_numpy(combined_mask / 255.0).permute(2, 0, 1).unsqueeze(0)
    face_contour_tensor = torch.from_numpy(np.array(face_contour) / 255.0).permute(2, 0, 1).unsqueeze(0)

    # Keep real pixels inside the eye/mouth regions; elsewhere show the contour.
    masked_ref = pixel_values_ref * combined_mask_tensor + face_contour_tensor * (1 - combined_mask_tensor)
    masked_ref = masked_ref.clamp(0, 1)
    masked_ref_np = (masked_ref.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)

    base, _ = os.path.splitext(save_path)
    resized_img = (pixel_values_ref.squeeze(0).permute(1, 2, 0).cpu().numpy().clip(0, 1) * 255).astype(np.uint8)
    Image.fromarray(resized_img).save(f"{base}_resize.png")
    Image.fromarray(masked_ref_np).save(f"{base}_masked.png")

if __name__ == '__main__':
    import fire
    fire.Fire(generate_masked_image)
    # python img_to_mask.py --image_path /mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/KristiNoem2-Scene-001.png --save_path /mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/KristiNoem2-Scene-001.png --crop True --union_bbox_scale 1.6
    # python img_to_latent.py --mask_image_path ./test_case/ChrisVanHollen0-Scene-003_masked.png --save_npz_path ./test_case/ChrisVanHollen0-Scene-003_resize.npz
    # python latent_two_video.py --npz_path ./test_case/ChrisVanHollen0-Scene-003_resize.npz --save_dir ./test_case/
tools/visualization_0416/latent_to_video.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import os

# Resolve the project root and prepend it to sys.path so that the project's
# own `utils` package wins over any same-named installed module.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

import numpy as np
import torch
from PIL import Image
import torchvision.transforms as T
from omegaconf import OmegaConf
import fire
import imageio
import moviepy.editor as mp
from tqdm import tqdm
19
+
20
def init_fn(config_path, version):
    """Load the head-animator checkpoint and return its sub-modules on CUDA.

    Returns a dict with the 512x512 normalizing image ``transform`` plus the
    model's ``flow_estimator``, ``face_generator`` and ``face_encoder``.
    """
    # Make the version-specific model package importable first.
    sys.path.insert(0, f'./utils/model_{version}')
    from utils import instantiate

    cfg = OmegaConf.load(config_path)
    model_cls = instantiate(cfg.model, instantiate_module=False)
    net = model_cls(config=cfg)
    state = torch.load(cfg.resume_ckpt, map_location="cpu")
    # strict=False: the checkpoint may carry keys the module does not define.
    net.load_state_dict(state["state_dict"], strict=False)
    net.eval().to("cuda")

    preprocess = T.Compose([
        T.Resize((512, 512)),
        T.ToTensor(),
        T.Normalize([0.5], [0.5]),
    ])
    return {
        "transform": preprocess,
        "flow_estimator": net.flow_estimator,
        "face_generator": net.face_generator,
        "face_encoder": net.face_encoder,
    }
40
+
41
def latent_to_video(
        npz_dir="./test_case/",
        save_dir="./test_case/",
        save_fps: int = 25,
        config_path: str = './configs/head_animator_best_0416.yaml',
        version: str = '0416',
):
    """Render reconstruction videos/images from motion-latent .npz files.

    For every ``*_output.npz`` in *npz_dir*, drives the face generator with
    the stored motion latents (frame 0 is the reference pose) and writes a
    PNG (single frame) or an MP4 (multi-frame, with audio muxed in when the
    npz provides an ``audio_path``).

    :param npz_dir: Directory scanned for ``*_output.npz`` files.
    :param save_dir: Output directory (created if missing).
    :param save_fps: Frame rate of the written video.
    :param config_path: Model config; "0416" in its file name is swapped for *version*.
    :param version: Model version tag (selects utils/model_<version>).
    """
    # Resolve relative paths:
    # - npz_dir / save_dir: relative paths are anchored at the project root
    # - config_path: relative paths are anchored at this script's directory
    #   (tools/visualization_0416/)
    if not os.path.isabs(npz_dir):
        npz_dir = os.path.join(_PROJECT_ROOT, npz_dir)
    if not os.path.isabs(save_dir):
        save_dir = os.path.join(_PROJECT_ROOT, save_dir)
    if not os.path.isabs(config_path):
        config_path = os.path.join(_SCRIPT_DIR, config_path)

    # Normalize paths (collapse redundant "." and "..").
    npz_dir = os.path.normpath(npz_dir)
    save_dir = os.path.normpath(save_dir)
    config_path = os.path.normpath(config_path)

    os.makedirs(save_dir, exist_ok=True)
    # Swap the version tag only in the file name, so a "0416" appearing in
    # the directory path is not also rewritten to "0506".
    config_dir = os.path.dirname(config_path)
    config_name = os.path.basename(config_path)
    config_name = config_name.replace("0416", version)
    config_path = os.path.join(config_dir, config_name)

    # Initialize models only once
    print("Initializing models...")
    print(f"NPZ directory: {npz_dir}")
    print(f"Save directory: {save_dir}")
    ctx = init_fn(config_path, version)
    transform = ctx["transform"]
    flow_estimator = ctx["flow_estimator"]
    face_generator = ctx["face_generator"]
    face_encoder = ctx["face_encoder"]

    # Get all npz files
    if not os.path.exists(npz_dir):
        print(f"Error: NPZ directory does not exist: {npz_dir}")
        return

    npz_files = [f for f in os.listdir(npz_dir) if f.endswith('_output.npz')]
    print(f"Found {len(npz_files)} files to process")

    # Process each file
    for npz_file in tqdm(npz_files, desc="Processing files"):
        if not npz_file.endswith('.npz'): continue
        try:
            npz_path = os.path.join(npz_dir, npz_file)
            data = np.load(npz_path, allow_pickle=True)
            motion_latent = torch.from_numpy(data["motion_latent"]).to("cuda").float()
            # Accept either (1, T, D) or (T, D) latents.
            if len(motion_latent.shape) == 3:
                motion_latent = motion_latent.squeeze(0)
            num_frames = motion_latent.shape[0]
            print(f"\nProcessing {npz_file} with {num_frames} frames")

            # Resolve ref_img_path against the project root if it is relative.
            ref_img_path = str(data["ref_img_path"])
            if not os.path.isabs(ref_img_path):
                ref_img_path = os.path.join(_PROJECT_ROOT, ref_img_path)
            ref_img = Image.open(ref_img_path).convert("RGB")
            ref_img = transform(ref_img).unsqueeze(0).to("cuda")

            with torch.no_grad():
                face_feat = face_encoder(ref_img)
                recon_list = []
                # Frame 0 serves as the reference pose for the flow estimator.
                for i in range(0, num_frames):
                    tgt = flow_estimator(motion_latent[0:1], motion_latent[i:i+1])
                    recon_list.append(face_generator(tgt, face_feat))

            recon = torch.cat(recon_list, dim=0)
            video_np = recon.permute(0, 2, 3, 1).cpu().numpy()
            # Generator output is in [-1, 1]; map to uint8 [0, 255].
            video_np = np.clip((video_np + 1) / 2 * 255, 0, 255).astype("uint8")

            video_id = str(data["video_id"])
            # Remove leading dash to prevent FFMPEG command line parsing issues
            if video_id.startswith('-'):
                video_id = video_id[1:]

            if num_frames == 1:
                out_path = os.path.join(save_dir, f"{video_id}_rec.png")
                Image.fromarray(video_np[0]).save(out_path)
            else:
                temp_mp4 = os.path.join(save_dir, f"{video_id}_temp.mp4")
                final_mp4 = os.path.join(save_dir, f"{video_id}.mp4")
                # The final rename restores the original (possibly dash-prefixed) id.
                finalfinal_mp4 = os.path.join(save_dir, f"{str(data['video_id'])}.mp4")
                with imageio.get_writer(temp_mp4, fps=save_fps) as writer:
                    for frame in video_np:
                        writer.append_data(frame)
                # Resolve audio_path against the project root if it is relative.
                audio_path = str(data["audio_path"]) if "audio_path" in data.files else None
                if audio_path and not os.path.isabs(audio_path):
                    audio_path = os.path.join(_PROJECT_ROOT, audio_path)
                if audio_path and os.path.exists(audio_path):
                    clip = mp.VideoFileClip(temp_mp4)
                    audio = mp.AudioFileClip(audio_path)
                    clip.set_audio(audio).write_videofile(final_mp4, codec="libx264", audio_codec="aac")
                    clip.close()
                    audio.close()
                    os.remove(temp_mp4)
                else:
                    os.rename(temp_mp4, final_mp4)
                os.rename(final_mp4, finalfinal_mp4)
        except Exception as e:
            print(f"Error processing {npz_file}: {str(e)}")
            continue

if __name__ == "__main__":
    fire.Fire(latent_to_video)
    # Example usage:
    # python latent_to_video.py --npz_dir ./test_case/ --save_dir ./test_case/ --config_path ./configs/head_animator_best_0409.yaml --version 0416
tools/visualization_0416/latent_to_video_batch.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Batch-processing optimized version of latent_to_video.

Compared with the original frame-by-frame loop, batched inference is
roughly 10-30x faster.
v2: optimized GPU->CPU transfer and video encoding via streaming writes.
"""
import sys
import os

# Resolve the project root and prepend it to sys.path so that the project's
# own `utils` package wins over any same-named installed module.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

import numpy as np
import torch
from PIL import Image
import torchvision.transforms as T
from omegaconf import OmegaConf
import fire
import imageio
import moviepy.editor as mp
from tqdm import tqdm
import time
import subprocess
import tempfile
27
+
28
+
29
def init_fn(config_path, version):
    """Load the checkpoint described by *config_path* and return its parts.

    The returned dict holds the 512x512 normalizing ``transform`` together
    with the model's ``flow_estimator``, ``face_generator`` and
    ``face_encoder`` sub-modules, all on CUDA in eval mode.
    """
    # The version-specific model package must be importable first.
    sys.path.insert(0, f'./utils/model_{version}')
    from utils import instantiate

    config = OmegaConf.load(config_path)
    model_cls = instantiate(config.model, instantiate_module=False)
    model = model_cls(config=config)
    ckpt = torch.load(config.resume_ckpt, map_location="cpu")
    # strict=False tolerates extra/missing keys in the checkpoint.
    model.load_state_dict(ckpt["state_dict"], strict=False)
    model.eval().to("cuda")

    img_transform = T.Compose(
        [T.Resize((512, 512)), T.ToTensor(), T.Normalize([0.5], [0.5])])

    return {
        "transform": img_transform,
        "flow_estimator": model.flow_estimator,
        "face_generator": model.face_generator,
        "face_encoder": model.face_encoder,
    }
49
+
50
+
51
+ def latent_to_video_batch(
52
+ npz_dir="./test_case/",
53
+ save_dir="./test_case/",
54
+ save_fps: int = 25,
55
+ config_path: str = './configs/head_animator_best_0416.yaml',
56
+ version: str = '0416',
57
+ batch_size: int = 32,
58
+ use_fp16: bool = True,
59
+ ):
60
+ """
61
+ 批处理优化版本的 latent_to_video
62
+
63
+ Args:
64
+ npz_dir: NPZ 文件目录
65
+ save_dir: 输出视频目录
66
+ save_fps: 输出视频帧率
67
+ config_path: 模型配置文件路径
68
+ version: 模型版本
69
+ batch_size: 批处理大小,根据显存调整 (默认 32,显存不足可降到 16 或 8)
70
+ use_fp16: 是否使用混合精度加速 (默认 True)
71
+ """
72
+ os.makedirs(save_dir, exist_ok=True)
73
+ config_path = config_path.replace("0416", version)
74
+
75
+ # Initialize models only once
76
+ print("Initializing models...")
77
+ ctx = init_fn(config_path, version)
78
+ transform = ctx["transform"]
79
+ flow_estimator = ctx["flow_estimator"]
80
+ face_generator = ctx["face_generator"]
81
+ face_encoder = ctx["face_encoder"]
82
+
83
+ # Get all npz files
84
+ npz_files = [f for f in os.listdir(npz_dir) if f.endswith('_output.npz')]
85
+ print(f"Found {len(npz_files)} files to process")
86
+ print(f"Batch size: {batch_size}, FP16: {use_fp16}")
87
+
88
+ total_frames = 0
89
+ total_time = 0
90
+
91
+ # Process each file
92
+ for npz_file in tqdm(npz_files, desc="Processing files"):
93
+ if not npz_file.endswith('.npz'):
94
+ continue
95
+ try:
96
+ npz_path = os.path.join(npz_dir, npz_file)
97
+ data = np.load(npz_path, allow_pickle=True)
98
+ motion_latent = torch.from_numpy(data["motion_latent"]).to("cuda").float()
99
+ if len(motion_latent.shape) == 3:
100
+ motion_latent = motion_latent.squeeze(0)
101
+ num_frames = motion_latent.shape[0]
102
+ print(f"\nProcessing {npz_file} with {num_frames} frames")
103
+
104
+ # 处理 ref_img_path - 如果是相对路径,基于项目根目录解析
105
+ ref_img_path = str(data["ref_img_path"])
106
+ if not os.path.isabs(ref_img_path):
107
+ ref_img_path = os.path.join(_PROJECT_ROOT, ref_img_path)
108
+ ref_img = Image.open(ref_img_path).convert("RGB")
109
+ ref_img = transform(ref_img).unsqueeze(0).to("cuda")
110
+
111
+ video_id = str(data["video_id"])
112
+ # Remove leading dash to prevent FFMPEG command line parsing issues
113
+ if video_id.startswith('-'):
114
+ video_id = video_id[1:]
115
+
116
+ # 处理 audio_path
117
+ audio_path = str(data["audio_path"]) if "audio_path" in data.files else None
118
+ if audio_path and not os.path.isabs(audio_path):
119
+ audio_path = os.path.join(_PROJECT_ROOT, audio_path)
120
+
121
+ start_time = time.time()
122
+
123
+ # 准备输出路径
124
+ temp_mp4 = os.path.join(save_dir, f"{video_id}_temp.mp4")
125
+ final_mp4 = os.path.join(save_dir, f"{video_id}.mp4")
126
+ finalfinal_mp4 = os.path.join(save_dir, f"{str(data['video_id'])}.mp4")
127
+
128
+ if num_frames == 1:
129
+ # 单帧情况
130
+ with torch.no_grad():
131
+ with torch.cuda.amp.autocast(enabled=use_fp16):
132
+ face_feat = face_encoder(ref_img)
133
+ tgt = flow_estimator(motion_latent[0:1], motion_latent[0:1])
134
+ recon = face_generator(tgt, face_feat)
135
+ if use_fp16:
136
+ recon = recon.float()
137
+
138
+ video_np = recon.permute(0, 2, 3, 1).cpu().numpy()
139
+ video_np = np.clip((video_np + 1) / 2 * 255, 0, 255).astype("uint8")
140
+ out_path = os.path.join(save_dir, f"{video_id}_rec.png")
141
+ Image.fromarray(video_np[0]).save(out_path)
142
+ else:
143
+ # 多帧情况 - 使用 FFmpeg pipe 流式编码
144
+ # 启动 FFmpeg 进程
145
+ ffmpeg_cmd = [
146
+ 'ffmpeg', '-y',
147
+ '-f', 'rawvideo',
148
+ '-vcodec', 'rawvideo',
149
+ '-s', '512x512',
150
+ '-pix_fmt', 'rgb24',
151
+ '-r', str(save_fps),
152
+ '-i', '-',
153
+ '-c:v', 'libx264',
154
+ '-preset', 'fast',
155
+ '-crf', '18',
156
+ '-pix_fmt', 'yuv420p',
157
+ temp_mp4
158
+ ]
159
+
160
+ ffmpeg_process = subprocess.Popen(
161
+ ffmpeg_cmd,
162
+ stdin=subprocess.PIPE,
163
+ stdout=subprocess.DEVNULL,
164
+ stderr=subprocess.DEVNULL
165
+ )
166
+
167
+ with torch.no_grad():
168
+ with torch.cuda.amp.autocast(enabled=use_fp16):
169
+ face_feat = face_encoder(ref_img) # (1, 32, 16, 64, 64)
170
+ ref_latent = motion_latent[0:1] # 参考帧的 latent
171
+
172
+ # 批处理推理 + 流式写入
173
+ for i in range(0, num_frames, batch_size):
174
+ batch_end = min(i + batch_size, num_frames)
175
+ current_batch_size = batch_end - i
176
+
177
+ # 获取当前批次的 motion latent
178
+ batch_motion = motion_latent[i:batch_end]
179
+
180
+ # 扩展参考帧 latent 到批次大小
181
+ ref_latent_expanded = ref_latent.expand(current_batch_size, -1)
182
+
183
+ # 扩展 face_feat 到批次大小
184
+ face_feat_expanded = face_feat.expand(current_batch_size, -1, -1, -1, -1)
185
+
186
+ # 批量计算 flow
187
+ tgt = flow_estimator(ref_latent_expanded, batch_motion)
188
+
189
+ # 批量生成图像
190
+ recon = face_generator(tgt, face_feat_expanded)
191
+
192
+ # 转换并写入 - 直接在 GPU 上做归一化
193
+ # (batch, 3, 512, 512) -> (batch, 512, 512, 3)
194
+ recon = recon.float()
195
+ recon = (recon + 1) / 2 * 255
196
+ recon = recon.clamp(0, 255).to(torch.uint8)
197
+ recon = recon.permute(0, 2, 3, 1).contiguous()
198
+
199
+ # 分块传输到 CPU 并写入
200
+ frames_np = recon.cpu().numpy()
201
+ ffmpeg_process.stdin.write(frames_np.tobytes())
202
+
203
+ # 关闭 FFmpeg
204
+ ffmpeg_process.stdin.close()
205
+ ffmpeg_process.wait()
206
+
207
+ elapsed = time.time() - start_time
208
+ total_frames += num_frames
209
+ total_time += elapsed
210
+ fps = num_frames / elapsed
211
+ print(f" Rendered + encoded {num_frames} frames in {elapsed:.2f}s ({fps:.1f} fps)")
212
+
213
+ # 合并音频
214
+ if audio_path and os.path.exists(audio_path):
215
+ # 使用 FFmpeg 直接合并音频(比 moviepy 快很多)
216
+ final_with_audio = os.path.join(save_dir, f"{video_id}_with_audio.mp4")
217
+ ffmpeg_audio_cmd = [
218
+ 'ffmpeg', '-y',
219
+ '-i', temp_mp4,
220
+ '-i', audio_path,
221
+ '-c:v', 'copy',
222
+ '-c:a', 'aac',
223
+ '-shortest',
224
+ final_with_audio
225
+ ]
226
+ subprocess.run(ffmpeg_audio_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
227
+ os.remove(temp_mp4)
228
+ os.rename(final_with_audio, finalfinal_mp4)
229
+ else:
230
+ os.rename(temp_mp4, finalfinal_mp4)
231
+
232
+ except Exception as e:
233
+ import traceback
234
+ print(f"Error processing {npz_file}: {str(e)}")
235
+ traceback.print_exc()
236
+ continue
237
+
238
+ # 打印总体统计
239
+ if total_time > 0:
240
+ print(f"\n{'='*50}")
241
+ print(f"总计: {total_frames} 帧, {total_time:.2f} ��")
242
+ print(f"平均渲染速度: {total_frames / total_time:.1f} fps")
243
+ print(f"{'='*50}")
244
+
245
+
246
if __name__ == "__main__":
    # Expose the batch converter as a command-line tool via python-fire:
    # every keyword argument of latent_to_video_batch becomes a CLI flag.
    fire.Fire(latent_to_video_batch)
    # Example usage:
    # python latent_to_video_batch.py --npz_dir ./test_case/ --save_dir ./test_case/ --batch_size 32 --use_fp16 True
tools/visualization_0416/utils/__init__.py ADDED
File without changes
tools/visualization_0416/utils/face_detector.py ADDED
@@ -0,0 +1,624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mediapipe as mp
2
+ from mediapipe import solutions
3
+ from mediapipe.framework.formats import landmark_pb2
4
+ import numpy as np
5
+ import cv2
6
+
7
+
8
def convert_bbox_to_square_bbox(bbox, max_h, max_w, scale=1.0):
    """Expand an axis-aligned bbox ``[(x0, y0), (x1, y1)]`` into a square box.

    The square shares the original box's center, its side length is the
    longer of the two edges multiplied by ``scale``, and the result is
    clipped to the ``max_w`` x ``max_h`` image bounds.

    Returns:
        A flat list ``[x_min, y_min, x_max, y_max]`` (note: flat, unlike
        the nested pair format of the input).
    """
    (x0, y0), (x1, y1) = bbox

    # Side of the square: longest edge, optionally enlarged.
    side = max(x1 - x0, y1 - y0) * scale
    half = side / 2

    # Center of the original bounding box.
    cx = (x0 + x1) / 2
    cy = (y0 + y1) / 2

    # Corners of the square, truncated to ints and clamped into the image.
    x_min = max(0, int(cx - half))
    y_min = max(0, int(cy - half))
    x_max = min(max_w, int(cx + half))
    y_max = min(max_h, int(cy + half))

    return [x_min, y_min, x_max, y_max]
31
+
32
+
33
def draw_landmarks_on_image(rgb_image, detection_result):
    """Overlay MediaPipe face-mesh landmarks on a copy of *rgb_image*.

    For every face in *detection_result* the tesselation, contour, and
    iris connection sets are drawn with MediaPipe's default styles.

    Returns:
        A new image array with the landmarks rendered; the input image
        is left untouched.
    """
    annotated_image = np.copy(rgb_image)

    for face_landmarks in detection_result.face_landmarks:
        # The drawing utilities expect a NormalizedLandmarkList proto,
        # so convert the task-API landmarks first.
        proto = landmark_pb2.NormalizedLandmarkList()
        proto.landmark.extend(
            landmark_pb2.NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z)
            for lm in face_landmarks
        )

        face_mesh = mp.solutions.face_mesh
        styles = mp.solutions.drawing_styles
        # Each (connection set, style) pair is rendered onto the image.
        for connections, connection_style in (
            (face_mesh.FACEMESH_TESSELATION,
             styles.get_default_face_mesh_tesselation_style()),
            (face_mesh.FACEMESH_CONTOURS,
             styles.get_default_face_mesh_contours_style()),
            (face_mesh.FACEMESH_IRISES,
             styles.get_default_face_mesh_iris_connections_style()),
        ):
            solutions.drawing_utils.draw_landmarks(
                image=annotated_image,
                landmark_list=proto,
                connections=connections,
                landmark_drawing_spec=None,
                connection_drawing_spec=connection_style,
            )

    return annotated_image
75
+
76
+
77
class FaceDetector:
    """Wrapper around MediaPipe's FaceLandmarker task (single-image mode).

    Produces per-face landmark keypoints, face/mouth/eye bounding boxes,
    a coarse head-orientation label, blendshape scores, and facial
    transformation matrices for downstream mask / visualization code.
    """

    def __init__(self, mediapipe_model_asset_path, delegate=1, face_detection_confidence=0.5, num_faces=5):
        """Build the FaceLandmarker.

        Args:
            mediapipe_model_asset_path: path to the ``.task`` model file.
            delegate: MediaPipe compute delegate (1 appears to select GPU
                here; the enum form is commented out below — confirm).
            face_detection_confidence: threshold reused for detection,
                presence, and tracking confidence.
            num_faces: maximum number of faces to detect.
        """
        # Create a face landmarker instance with the video mode:
        options = mp.tasks.vision.FaceLandmarkerOptions(
            base_options=mp.tasks.BaseOptions(
                model_asset_path=mediapipe_model_asset_path,
                # delegate=mp.tasks.BaseOptions.Delegate.GPU,
                # TODO: why does the gpu version not work in docker???
                delegate=delegate,
            ),
            running_mode=mp.tasks.vision.RunningMode.IMAGE,
            num_faces=num_faces,
            output_face_blendshapes=True,
            output_facial_transformation_matrixes=True,
            min_face_detection_confidence=face_detection_confidence,
            min_face_presence_confidence=face_detection_confidence,
            min_tracking_confidence=face_detection_confidence,
        )
        self.detector = mp.tasks.vision.FaceLandmarker.create_from_options(options)

    def get_one_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
        """Detect faces and derive detailed per-face geometry.

        Args:
            image: RGB image array of shape (H, W, 3) — assumed uint8, as
                required by ``mp.Image`` (TODO confirm at call sites).
            mouth_bbox_scale: enlargement factor for the square mouth box.
            eye_bbox_scale: enlargement factor for the square eye boxes.
            annotate_image: when True, also return a landmark overlay image.
            save_vis: when True, write a debug image to 'image_detect.png'.

        Returns:
            An 18-tuple of per-face lists; compared to
            ``get_face_xy_rotation_and_keypoints`` this adds mouth/nose/eye
            center points and eyeball circle images + masks (indices 12-17,
            labeled at the return statement).
        """
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

        # get facial rotation
        results = self.detector.detect(mp_image)
        max_h, max_w = image.shape[:2]

        if annotate_image:
            annotated_image = draw_landmarks_on_image(image, results)
        else:
            annotated_image = None

        # Per-face accumulators; one entry appended per detected face.
        all_x = []
        all_y = []
        all_orientation = []
        all_keypoints = []
        all_bounding_box = []
        all_mouth_bounding_box = []
        all_eye_bounding_box = []
        all_face_contour = []
        all_eyeball = []
        all_eyeball_mask = []
        all_blendshapes = []
        all_mouth_p = []
        all_nose_p = []
        all_left_eye_p = []
        all_right_eye_p = []
        num_faces = len(results.face_landmarks)

        # Flatten blendshape categories to a plain list of scores per face.
        for face_blendshapes in results.face_blendshapes:
            blendshapes = [item.score for item in face_blendshapes]
            all_blendshapes.append(blendshapes)

        all_facial_transformation_matrices = results.facial_transformation_matrixes

        for face_landmarks in results.face_landmarks:
            keypoints = []
            bounding_box = []

            h, w = image.shape[0], image.shape[1]
            # Running min/max over pixel coordinates -> full-face bbox.
            cx_min, cy_min = w, h
            cx_max, cy_max = 0, 0
            for idx, lm in enumerate(face_landmarks):
                # Clip landmarks if they go off the image
                cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)

                if cx < cx_min:
                    cx_min = cx
                if cy < cy_min:
                    cy_min = cy
                if cx > cx_max:
                    cx_max = cx
                if cy > cy_max:
                    cy_max = cy

                # Keep normalized (x, y, z) for every landmark.
                keypoints.append((lm.x, lm.y, lm.z))

                # Reference landmarks used for the orientation heuristic.
                if idx == 137:
                    right_cheek = (lm.x, lm.y, lm.z)
                if idx == 366:
                    left_cheek = (lm.x, lm.y, lm.z)
                if idx == 4:
                    nose = (lm.x, lm.y, lm.z)

            # get vector from middle of face to tip of nose
            face_middle = (
                (right_cheek[0] + left_cheek[0]) / 2.0,
                (right_cheek[1] + left_cheek[1]) / 2.0,
            )

            x = nose[0] - face_middle[0]
            y = nose[1] - face_middle[1]

            # Heuristic yaw classification from the normalized x offset.
            if x > 0.15:
                orientation = "left"
            elif x < -0.15:
                orientation = "right"
            else:
                orientation = "forward"

            bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]

            all_keypoints.append(keypoints)
            all_bounding_box.append(bounding_box)
            all_x.append(x)
            all_y.append(y)
            all_orientation.append(orientation)

            # Mouth landmark indices, listed as consecutive (start, end)
            # connection pairs around the outer and inner lip contours.
            mouth_landmarks = [
                61, 146, 146, 91, 91, 181, 181, 84, 84, 17,
                17, 314, 314, 405, 405, 321, 321, 375, 375, 291,
                61, 185, 185, 40, 40, 39, 39, 37, 37, 0,
                0, 267, 267, 269, 269, 270, 270, 409, 409, 291,
                78, 95, 95, 88, 88, 178, 178, 87, 87, 14,
                14, 317, 317, 402, 402, 318, 318, 324, 324, 308,
                78, 191, 191, 80, 80, 81, 81, 82, 82, 13,
                13, 312, 312, 311, 311, 310, 310, 415, 415, 308,
            ]
            mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
            mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
            mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
            # NOTE(review): the second component adds x_max and y_max; the
            # center y would normally be (y_min + y_max) / 2 — confirm intended.
            mouth_p = np.array([(mouth_bbox[0][0] + mouth_bbox[1][0]) / 2, (mouth_bbox[1][0] + mouth_bbox[1][1]) / 2])
            mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)

            nose_landmarks = [48, 115, 220, 45, 4, 275, 440, 344, 278]
            nose_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in nose_landmarks]
            nose_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in nose_landmarks]
            nose_bbox = [(min(nose_x), min(nose_y)), (max(nose_x), max(nose_y))]
            # NOTE(review): same x_max/y_max mixing caveat as mouth_p above.
            nose_p = np.array([(nose_bbox[0][0] + nose_bbox[1][0]) / 2, (nose_bbox[1][0] + nose_bbox[1][1]) / 2])

            all_mouth_bounding_box.append(mouth_bbox)

            # Eye contour landmark rings (MediaPipe face-mesh indices).
            left_eye_landmarks = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
            right_eye_landmarks = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]

            left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
            left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
            left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
            # Vertical eye extent in pixels; reused below for the iris radius.
            left_size = max(left_eye_y) - min(left_eye_y)
            # NOTE(review): same x_max/y_max mixing caveat as mouth_p above.
            left_eye_p = np.array([(left_eye_bbox[0][0] + left_eye_bbox[1][0]) / 2, (left_eye_bbox[1][0] + left_eye_bbox[1][1]) / 2])
            left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

            right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
            right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
            right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
            right_size = max(right_eye_y) - min(right_eye_y)
            # NOTE(review): same x_max/y_max mixing caveat as mouth_p above.
            right_eye_p = np.array([(right_eye_bbox[0][0] + right_eye_bbox[1][0]) / 2, (right_eye_bbox[1][0] + right_eye_bbox[1][1]) / 2])
            right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

            eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}

            all_eye_bounding_box.append(eye_bbox)

            # One white pixel per landmark -> sparse "contour" image.
            face_contour = np.zeros_like(image)
            for landmark_id, landmark in enumerate(face_landmarks):
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if cy >= max_h or cx >= max_w: continue
                if cy < 0 or cx < 0: continue
                face_contour[cy, cx] = (255, 255, 255)

            # Filled red circles at the iris-center landmarks (468/473 in
            # MediaPipe's 478-point mesh), radius a third of the eye height.
            eyeball = np.zeros_like(image)
            for landmark_id, landmark in enumerate(face_landmarks):
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if landmark_id not in [468, 473]: continue
                if cy >= max_h or cx >= max_w: continue
                if cy < 0 or cx < 0: continue
                radius = int(left_size // 3) if landmark_id == 468 else int(right_size // 3)
                cv2.circle(eyeball, (cx, cy), radius=radius, color=(255, 0, 0), thickness=-1)
            # Boolean (H, W, 1) mask of the drawn eyeball pixels.
            eyeball_mask = (eyeball.sum(axis=2) != 0)[:, :, None]

            all_eyeball.append(eyeball)
            all_eyeball_mask.append(eyeball_mask)
            all_face_contour.append(face_contour)
            all_mouth_p.append(mouth_p)
            all_nose_p.append(nose_p)
            all_left_eye_p.append(left_eye_p)
            all_right_eye_p.append(right_eye_p)

            if save_vis:
                # Debug rendering: boxes + color-coded landmarks, written
                # to a fixed filename (mutates *image* in place).
                x_min, y_min, x_max, y_max = mouth_bbox
                cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

                for eye_key, bbox in eye_bbox.items():
                    x_min, y_min, x_max, y_max = bbox
                    color = (0, 0, 255)
                    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)

                for landmark_id, landmark in enumerate(face_landmarks):
                    cx, cy = int(landmark.x * w), int(landmark.y * h)
                    circle_size = 2
                    if landmark_id in mouth_landmarks:
                        cv2.circle(image, (cx, cy), circle_size, (0, 0, 255), -1)
                    elif landmark_id in left_eye_landmarks+right_eye_landmarks:
                        cv2.circle(image, (cx, cy), circle_size, (0, 255, 0), -1)
                    else:
                        cv2.circle(image, (cx, cy), circle_size, (255, 255, 255), -1)
                cv2.imwrite('image_detect.png', image[:,:,::-1])

        return (
            all_x,
            all_y,
            all_orientation,
            num_faces,
            all_keypoints,
            all_bounding_box,
            all_mouth_bounding_box,
            all_eye_bounding_box,
            all_face_contour,
            all_blendshapes,
            all_facial_transformation_matrices,
            annotated_image,
            all_mouth_p, # 12
            all_nose_p, # 13
            all_left_eye_p, # 14
            all_right_eye_p, # 15
            all_eyeball, # 16
            all_eyeball_mask, # 17
        )

    def get_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
        """Detect faces and derive per-face geometry (12-tuple variant).

        NOTE(review): this largely duplicates
        ``get_one_face_xy_rotation_and_keypoints`` minus the center-point
        and eyeball outputs — consider extracting a shared helper.

        Args:
            image: RGB image array of shape (H, W, 3) — assumed uint8, as
                required by ``mp.Image`` (TODO confirm at call sites).
            mouth_bbox_scale: enlargement factor for the square mouth box.
            eye_bbox_scale: enlargement factor for the square eye boxes.
            annotate_image: when True, also return a landmark overlay image.
            save_vis: when True, write a debug image to 'image_detect.png'.

        Returns:
            A 12-tuple of per-face lists (see the return statement).
        """
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

        # get facial rotation
        results = self.detector.detect(mp_image)
        max_h, max_w = image.shape[:2]

        if annotate_image:
            annotated_image = draw_landmarks_on_image(image, results)
        else:
            annotated_image = None

        # Per-face accumulators; one entry appended per detected face.
        all_x = []
        all_y = []
        all_orientation = []
        all_keypoints = []
        all_bounding_box = []
        all_mouth_bounding_box = []
        all_eye_bounding_box = []
        all_face_contour = []
        all_blendshapes = []
        num_faces = len(results.face_landmarks)

        # Flatten blendshape categories to a plain list of scores per face.
        for face_blendshapes in results.face_blendshapes:
            blendshapes = [item.score for item in face_blendshapes]
            all_blendshapes.append(blendshapes)

        all_facial_transformation_matrices = results.facial_transformation_matrixes

        for face_landmarks in results.face_landmarks:
            keypoints = []
            bounding_box = []

            h, w = image.shape[0], image.shape[1]
            # Running min/max over pixel coordinates -> full-face bbox.
            cx_min, cy_min = w, h
            cx_max, cy_max = 0, 0
            for idx, lm in enumerate(face_landmarks):
                # Clip landmarks if they go off the image
                cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)

                if cx < cx_min:
                    cx_min = cx
                if cy < cy_min:
                    cy_min = cy
                if cx > cx_max:
                    cx_max = cx
                if cy > cy_max:
                    cy_max = cy

                # Keep normalized (x, y, z) for every landmark.
                keypoints.append((lm.x, lm.y, lm.z))

                # Reference landmarks used for the orientation heuristic.
                if idx == 137:
                    right_cheek = (lm.x, lm.y, lm.z)
                if idx == 366:
                    left_cheek = (lm.x, lm.y, lm.z)
                if idx == 4:
                    nose = (lm.x, lm.y, lm.z)

            # get vector from middle of face to tip of nose
            face_middle = (
                (right_cheek[0] + left_cheek[0]) / 2.0,
                (right_cheek[1] + left_cheek[1]) / 2.0,
            )

            x = nose[0] - face_middle[0]
            y = nose[1] - face_middle[1]

            # Heuristic yaw classification from the normalized x offset.
            if x > 0.15:
                orientation = "left"
            elif x < -0.15:
                orientation = "right"
            else:
                orientation = "forward"

            bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]

            all_keypoints.append(keypoints)
            all_bounding_box.append(bounding_box)
            all_x.append(x)
            all_y.append(y)
            all_orientation.append(orientation)

            # Mouth landmark indices, listed as consecutive (start, end)
            # connection pairs around the outer and inner lip contours.
            mouth_landmarks = [
                61, 146, 146, 91, 91, 181, 181, 84, 84, 17,
                17, 314, 314, 405, 405, 321, 321, 375, 375, 291,
                61, 185, 185, 40, 40, 39, 39, 37, 37, 0,
                0, 267, 267, 269, 269, 270, 270, 409, 409, 291,
                78, 95, 95, 88, 88, 178, 178, 87, 87, 14,
                14, 317, 317, 402, 402, 318, 318, 324, 324, 308,
                78, 191, 191, 80, 80, 81, 81, 82, 82, 13,
                13, 312, 312, 311, 311, 310, 310, 415, 415, 308,
            ]
            mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
            mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
            mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
            mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)

            all_mouth_bounding_box.append(mouth_bbox)

            # Eye contour landmark rings (MediaPipe face-mesh indices).
            left_eye_landmarks = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
            right_eye_landmarks = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]

            left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
            left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
            left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
            left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

            right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
            right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
            right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
            right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

            eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}

            all_eye_bounding_box.append(eye_bbox)

            # One white pixel per landmark -> sparse "contour" image.
            face_contour = np.zeros_like(image)
            for landmark_id, landmark in enumerate(face_landmarks):
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if cy >= max_h or cx >= max_w: continue
                if cy < 0 or cx < 0: continue
                face_contour[cy, cx] = (255, 255, 255)
            all_face_contour.append(face_contour)

            if save_vis:
                # Debug rendering: boxes + color-coded landmarks, written
                # to a fixed filename (mutates *image* in place).
                import cv2
                x_min, y_min, x_max, y_max = mouth_bbox
                cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

                for eye_key, bbox in eye_bbox.items():
                    x_min, y_min, x_max, y_max = bbox
                    color = (0, 0, 255)
                    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)

                for landmark_id, landmark in enumerate(face_landmarks):
                    cx, cy = int(landmark.x * w), int(landmark.y * h)
                    circle_size = 2
                    if landmark_id in mouth_landmarks:
                        cv2.circle(image, (cx, cy), circle_size, (0, 0, 255), -1)
                    elif landmark_id in left_eye_landmarks+right_eye_landmarks:
                        cv2.circle(image, (cx, cy), circle_size, (0, 255, 0), -1)
                    else:
                        cv2.circle(image, (cx, cy), circle_size, (255, 255, 255), -1)
                cv2.imwrite('image_detect.png', image[:,:,::-1])

        return (
            all_x,
            all_y,
            all_orientation,
            num_faces,
            all_keypoints,
            all_bounding_box,
            all_mouth_bounding_box,
            all_eye_bounding_box,
            all_face_contour,
            all_blendshapes,
            all_facial_transformation_matrices,
            annotated_image,
        )
tools/visualization_0416/utils/face_landmarker.task ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
3
+ size 3758596