Commit
·
872b1a7
1
Parent(s):
1631026
upload ckpt
Browse files- checkpoints/last.ckpt +3 -0
- tools/pretrained_model/epoch=0-step=312000.ckpt +3 -0
- tools/visualization_0416/configs/audio_head_animator.yaml +154 -0
- tools/visualization_0416/configs/head_animator_best_0506.yaml +153 -0
- tools/visualization_0416/img_to_latent.py +71 -0
- tools/visualization_0416/img_to_mask.py +199 -0
- tools/visualization_0416/latent_to_video.py +156 -0
- tools/visualization_0416/latent_to_video_batch.py +249 -0
- tools/visualization_0416/utils/__init__.py +0 -0
- tools/visualization_0416/utils/face_detector.py +624 -0
- tools/visualization_0416/utils/face_landmarker.task +3 -0
checkpoints/last.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15baf05196834ca54b4da7c2c9fd372b572e650b1edd84cb5a92c4be1689f29b
|
| 3 |
+
size 7730126719
|
tools/pretrained_model/epoch=0-step=312000.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9f06f55912dfa12ea18d77315f1c1675c5f67669097db8f155b4a4d75f9d31d
|
| 3 |
+
size 1579672001
|
tools/visualization_0416/configs/audio_head_animator.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 此配置文件主要用于 img_to_mask.py 获取 face detection 相关参数
|
| 2 |
+
debug: false
|
| 3 |
+
seed: 39
|
| 4 |
+
root_name: audio_head_animator
|
| 5 |
+
exp_name: ${root_name}/inference
|
| 6 |
+
mode: train
|
| 7 |
+
n_epochs: null
|
| 8 |
+
cache_dir: cache
|
| 9 |
+
ckpt_dir: ${exp_name}/ckpt
|
| 10 |
+
resume_ckpt: null
|
| 11 |
+
|
| 12 |
+
only_resume_state_dict: False
|
| 13 |
+
pretrained_ckpt: null
|
| 14 |
+
|
| 15 |
+
model:
|
| 16 |
+
module_name: model.head_animation.head_animator
|
| 17 |
+
class_name: HeadAnimatorModule
|
| 18 |
+
pretrained_ckpt: ${pretrained_ckpt}
|
| 19 |
+
using_hybrid_mask: True
|
| 20 |
+
output_dir: ${exp_name}
|
| 21 |
+
|
| 22 |
+
face_encoder:
|
| 23 |
+
module_name: model.head_animation.LIA_3d.face_encoder
|
| 24 |
+
class_name: FaceEncoder
|
| 25 |
+
image_size: 512
|
| 26 |
+
image_channel: 3
|
| 27 |
+
block_expansion: 64
|
| 28 |
+
num_down_blocks: 3
|
| 29 |
+
max_features: 512
|
| 30 |
+
reshape_channel: 32
|
| 31 |
+
reshape_depth: 16
|
| 32 |
+
num_resblocks: 6
|
| 33 |
+
|
| 34 |
+
motion_encoder:
|
| 35 |
+
module_name: model.head_animation.LIA_3d.motion_encoder
|
| 36 |
+
class_name: MotionEncoder
|
| 37 |
+
latent_dim: 512
|
| 38 |
+
size: ${model.face_encoder.image_size}
|
| 39 |
+
|
| 40 |
+
flow_estimator:
|
| 41 |
+
module_name: model.head_animation.LIA_3d.flow_estimator
|
| 42 |
+
class_name: FlowEstimator
|
| 43 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 44 |
+
motion_space: 64
|
| 45 |
+
|
| 46 |
+
face_generator:
|
| 47 |
+
module_name: model.head_animation.LIA_3d.face_generator
|
| 48 |
+
class_name: FaceGenerator
|
| 49 |
+
size: ${model.face_encoder.image_size}
|
| 50 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 51 |
+
outputsize: ${data.train_width}
|
| 52 |
+
reshape_channel: ${model.face_encoder.reshape_channel}
|
| 53 |
+
group_norm_channel: 32
|
| 54 |
+
flag_estimate_occlusion_map: True
|
| 55 |
+
|
| 56 |
+
discriminator:
|
| 57 |
+
module_name: model.head_animation.LIA.discriminator
|
| 58 |
+
class_name: Discriminator
|
| 59 |
+
size: ${data.train_width}
|
| 60 |
+
|
| 61 |
+
vgg_loss:
|
| 62 |
+
module_name: model.head_animation.VASA1.loss
|
| 63 |
+
class_name: VGGLoss
|
| 64 |
+
|
| 65 |
+
loss:
|
| 66 |
+
l_w_recon: 1
|
| 67 |
+
l_w_face_l1: 0
|
| 68 |
+
l_w_vgg: 2
|
| 69 |
+
l_w_gan: 0.2
|
| 70 |
+
l_w_face: 0
|
| 71 |
+
l_w_headpose: 0
|
| 72 |
+
l_w_gaze: 0
|
| 73 |
+
l_w_foreground: 0
|
| 74 |
+
l_w_local: 0
|
| 75 |
+
|
| 76 |
+
optimizer:
|
| 77 |
+
lr: 0.0001
|
| 78 |
+
discriminator_lr: 0.002
|
| 79 |
+
warmup_steps: 0
|
| 80 |
+
adam_beta1: 0.9
|
| 81 |
+
adam_beta2: 0.999
|
| 82 |
+
adam_epsilon: 1.0e-08
|
| 83 |
+
weight_decay: 0.0
|
| 84 |
+
g_reg_every: 4
|
| 85 |
+
d_reg_every: 16
|
| 86 |
+
|
| 87 |
+
logger:
|
| 88 |
+
neptune_project: null
|
| 89 |
+
neptune_api_token: null
|
| 90 |
+
wandb:
|
| 91 |
+
enabled: false
|
| 92 |
+
entity: null
|
| 93 |
+
project: "real-time"
|
| 94 |
+
|
| 95 |
+
callbacks:
|
| 96 |
+
- module_name: lightning.pytorch.callbacks
|
| 97 |
+
class_name: ModelCheckpoint
|
| 98 |
+
dirpath: ${ckpt_dir}
|
| 99 |
+
every_n_train_steps: 2000
|
| 100 |
+
save_top_k: -1
|
| 101 |
+
|
| 102 |
+
trainer:
|
| 103 |
+
accelerator: gpu
|
| 104 |
+
log_every_n_steps: 1
|
| 105 |
+
val_check_interval: 100000
|
| 106 |
+
|
| 107 |
+
data:
|
| 108 |
+
debug: False
|
| 109 |
+
train_bs: 12
|
| 110 |
+
accumulate_grad_batches: 1
|
| 111 |
+
n_sample_frames: 1
|
| 112 |
+
past_n: 1
|
| 113 |
+
num_workers: 8
|
| 114 |
+
ref_sample_margin: 10
|
| 115 |
+
train_width: 512
|
| 116 |
+
train_height: 512
|
| 117 |
+
union_bbox_scale: [1.2, 1.4]
|
| 118 |
+
mouth_bbox_scale: 1.5
|
| 119 |
+
eye_bbox_scale: 2.0
|
| 120 |
+
hybrid_face_mask: ${model.using_hybrid_mask}
|
| 121 |
+
flip_aug: True
|
| 122 |
+
filter_hand_videos: true
|
| 123 |
+
random_sample: False
|
| 124 |
+
dataset_file_path: []
|
| 125 |
+
cache_file_path: []
|
| 126 |
+
train_fps: 25
|
| 127 |
+
dataloader: FastVideoDatasetV2
|
| 128 |
+
|
| 129 |
+
val_data:
|
| 130 |
+
train_bs: 1
|
| 131 |
+
n_sample_frames: 40
|
| 132 |
+
past_n: 2
|
| 133 |
+
num_workers: 6
|
| 134 |
+
ref_sample_margin: ${data.ref_sample_margin}
|
| 135 |
+
train_width: ${data.train_width}
|
| 136 |
+
train_height: ${data.train_height}
|
| 137 |
+
union_bbox_scale: [1.2, 1.4]
|
| 138 |
+
mouth_bbox_scale: ${data.mouth_bbox_scale}
|
| 139 |
+
eye_bbox_scale: ${data.eye_bbox_scale}
|
| 140 |
+
hybrid_face_mask: ${data.hybrid_face_mask}
|
| 141 |
+
flip_aug: False
|
| 142 |
+
filter_hand_videos: ${data.filter_hand_videos}
|
| 143 |
+
random_sample: False
|
| 144 |
+
dataset_file_path: []
|
| 145 |
+
train_fps: ${data.train_fps}
|
| 146 |
+
dataloader: ${data.dataloader}
|
| 147 |
+
|
| 148 |
+
test_data:
|
| 149 |
+
height: 384
|
| 150 |
+
width: 672
|
| 151 |
+
image_paths_and_scales: []
|
| 152 |
+
|
| 153 |
+
inference:
|
| 154 |
+
output_dir: inference_outputs/${exp_name}
|
tools/visualization_0416/configs/head_animator_best_0506.yaml
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
debug: false
|
| 2 |
+
seed: 39
|
| 3 |
+
root_name: head_animator_LIA3D
|
| 4 |
+
exp_name: ${root_name}/inference
|
| 5 |
+
mode: train
|
| 6 |
+
n_epochs: null
|
| 7 |
+
cache_dir: cache
|
| 8 |
+
ckpt_dir: ${exp_name}/ckpt
|
| 9 |
+
resume_ckpt: ../pretrained_model/epoch=0-step=312000.ckpt
|
| 10 |
+
|
| 11 |
+
only_resume_state_dict: False
|
| 12 |
+
pretrained_ckpt: null
|
| 13 |
+
|
| 14 |
+
model:
|
| 15 |
+
module_name: model.head_animation.head_animator
|
| 16 |
+
class_name: HeadAnimatorModule
|
| 17 |
+
pretrained_ckpt: ${pretrained_ckpt}
|
| 18 |
+
using_hybrid_mask: True
|
| 19 |
+
output_dir: ${exp_name}
|
| 20 |
+
|
| 21 |
+
face_encoder:
|
| 22 |
+
module_name: model.head_animation.LIA_3d.face_encoder
|
| 23 |
+
class_name: FaceEncoder
|
| 24 |
+
image_size: 512
|
| 25 |
+
image_channel: 3
|
| 26 |
+
block_expansion: 64
|
| 27 |
+
num_down_blocks: 3
|
| 28 |
+
max_features: 512
|
| 29 |
+
reshape_channel: 32
|
| 30 |
+
reshape_depth: 16
|
| 31 |
+
num_resblocks: 6
|
| 32 |
+
|
| 33 |
+
motion_encoder:
|
| 34 |
+
module_name: model.head_animation.LIA_3d.motion_encoder
|
| 35 |
+
class_name: MotionEncoder
|
| 36 |
+
latent_dim: 512
|
| 37 |
+
size: ${model.face_encoder.image_size}
|
| 38 |
+
|
| 39 |
+
flow_estimator:
|
| 40 |
+
module_name: model.head_animation.LIA_3d.flow_estimator
|
| 41 |
+
class_name: FlowEstimator
|
| 42 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 43 |
+
motion_space: 64
|
| 44 |
+
|
| 45 |
+
face_generator:
|
| 46 |
+
module_name: model.head_animation.LIA_3d.face_generator
|
| 47 |
+
class_name: FaceGenerator
|
| 48 |
+
size: ${model.face_encoder.image_size}
|
| 49 |
+
latent_dim: ${model.motion_encoder.latent_dim}
|
| 50 |
+
outputsize: ${data.train_width}
|
| 51 |
+
reshape_channel: ${model.face_encoder.reshape_channel}
|
| 52 |
+
group_norm_channel: 32
|
| 53 |
+
flag_estimate_occlusion_map: True
|
| 54 |
+
|
| 55 |
+
discriminator:
|
| 56 |
+
module_name: model.head_animation.LIA.discriminator
|
| 57 |
+
class_name: Discriminator
|
| 58 |
+
size: ${data.train_width}
|
| 59 |
+
|
| 60 |
+
vgg_loss:
|
| 61 |
+
module_name: model.head_animation.VASA1.loss
|
| 62 |
+
class_name: VGGLoss
|
| 63 |
+
|
| 64 |
+
loss:
|
| 65 |
+
l_w_recon: 1
|
| 66 |
+
l_w_face_l1: 0
|
| 67 |
+
l_w_vgg: 2
|
| 68 |
+
l_w_gan: 0.2
|
| 69 |
+
l_w_face: 0
|
| 70 |
+
l_w_headpose: 0
|
| 71 |
+
l_w_gaze: 0
|
| 72 |
+
l_w_foreground: 0
|
| 73 |
+
l_w_local: 0
|
| 74 |
+
|
| 75 |
+
optimizer:
|
| 76 |
+
lr: 0.0001
|
| 77 |
+
discriminator_lr: 0.002
|
| 78 |
+
warmup_steps: 0
|
| 79 |
+
adam_beta1: 0.9
|
| 80 |
+
adam_beta2: 0.999
|
| 81 |
+
adam_epsilon: 1.0e-08
|
| 82 |
+
weight_decay: 0.0
|
| 83 |
+
g_reg_every: 4
|
| 84 |
+
d_reg_every: 16
|
| 85 |
+
|
| 86 |
+
logger:
|
| 87 |
+
neptune_project: null
|
| 88 |
+
neptune_api_token: null
|
| 89 |
+
wandb:
|
| 90 |
+
enabled: false
|
| 91 |
+
entity: null
|
| 92 |
+
project: "real-time"
|
| 93 |
+
|
| 94 |
+
callbacks:
|
| 95 |
+
- module_name: lightning.pytorch.callbacks
|
| 96 |
+
class_name: ModelCheckpoint
|
| 97 |
+
dirpath: ${ckpt_dir}
|
| 98 |
+
every_n_train_steps: 2000
|
| 99 |
+
save_top_k: -1
|
| 100 |
+
|
| 101 |
+
trainer:
|
| 102 |
+
accelerator: gpu
|
| 103 |
+
log_every_n_steps: 1
|
| 104 |
+
val_check_interval: 100000
|
| 105 |
+
|
| 106 |
+
data:
|
| 107 |
+
debug: False
|
| 108 |
+
train_bs: 12
|
| 109 |
+
accumulate_grad_batches: 1
|
| 110 |
+
n_sample_frames: 1
|
| 111 |
+
past_n: 1
|
| 112 |
+
num_workers: 8
|
| 113 |
+
ref_sample_margin: 10
|
| 114 |
+
train_width: 512
|
| 115 |
+
train_height: 512
|
| 116 |
+
union_bbox_scale: [1.2, 1.4]
|
| 117 |
+
mouth_bbox_scale: 1.5
|
| 118 |
+
eye_bbox_scale: 2.0
|
| 119 |
+
hybrid_face_mask: ${model.using_hybrid_mask}
|
| 120 |
+
flip_aug: True
|
| 121 |
+
filter_hand_videos: true
|
| 122 |
+
random_sample: False
|
| 123 |
+
dataset_file_path: []
|
| 124 |
+
cache_file_path: []
|
| 125 |
+
train_fps: 25
|
| 126 |
+
dataloader: FastVideoDatasetV2
|
| 127 |
+
|
| 128 |
+
val_data:
|
| 129 |
+
train_bs: 1
|
| 130 |
+
n_sample_frames: 40
|
| 131 |
+
past_n: 2
|
| 132 |
+
num_workers: 6
|
| 133 |
+
ref_sample_margin: ${data.ref_sample_margin}
|
| 134 |
+
train_width: ${data.train_width}
|
| 135 |
+
train_height: ${data.train_height}
|
| 136 |
+
union_bbox_scale: [1.2, 1.4]
|
| 137 |
+
mouth_bbox_scale: ${data.mouth_bbox_scale}
|
| 138 |
+
eye_bbox_scale: ${data.eye_bbox_scale}
|
| 139 |
+
hybrid_face_mask: ${data.hybrid_face_mask}
|
| 140 |
+
flip_aug: False
|
| 141 |
+
filter_hand_videos: ${data.filter_hand_videos}
|
| 142 |
+
random_sample: False
|
| 143 |
+
dataset_file_path: []
|
| 144 |
+
train_fps: ${data.train_fps}
|
| 145 |
+
dataloader: ${data.dataloader}
|
| 146 |
+
|
| 147 |
+
test_data:
|
| 148 |
+
height: 384
|
| 149 |
+
width: 672
|
| 150 |
+
image_paths_and_scales: []
|
| 151 |
+
|
| 152 |
+
inference:
|
| 153 |
+
output_dir: inference_outputs/${exp_name}
|
tools/visualization_0416/img_to_latent.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# 获取项目根目录并添加到 sys.path 最前面,确保导入正确的 utils 模块
|
| 5 |
+
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 6 |
+
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
|
| 7 |
+
if _PROJECT_ROOT not in sys.path:
|
| 8 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import torchvision.transforms as T
|
| 14 |
+
from omegaconf import OmegaConf
|
| 15 |
+
import fire
|
| 16 |
+
|
| 17 |
+
def init_fn(config_path):
|
| 18 |
+
from utils import instantiate
|
| 19 |
+
transform = T.Compose([T.Resize((512, 512)), T.ToTensor(), T.Normalize([0.5], [0.5])])
|
| 20 |
+
config = OmegaConf.load(config_path)
|
| 21 |
+
module = instantiate(config.model, instantiate_module=False)
|
| 22 |
+
model = module(config=config)
|
| 23 |
+
checkpoint = torch.load(config.resume_ckpt, map_location="cpu")
|
| 24 |
+
model.load_state_dict(checkpoint["state_dict"], strict=False)
|
| 25 |
+
model.eval()
|
| 26 |
+
motion_encoder = model.motion_encoder
|
| 27 |
+
return {"transform": transform, "motion_encoder": motion_encoder}
|
| 28 |
+
|
| 29 |
+
def extract_motion_latent(
|
| 30 |
+
mask_image_path='./test_case/test_img_masked.png',
|
| 31 |
+
config_path='./configs/head_animator_best_0506.yaml',
|
| 32 |
+
save_npz_path='./test_case/test_img_resize.npz',
|
| 33 |
+
version="0506"):
|
| 34 |
+
sys.path.insert(0, f'./utils/model_{version}')
|
| 35 |
+
config_path = config_path.replace("0506", version)
|
| 36 |
+
context = init_fn(config_path)
|
| 37 |
+
transform = context["transform"]
|
| 38 |
+
motion_encoder = context["motion_encoder"]
|
| 39 |
+
img = Image.open(mask_image_path).convert("RGB")
|
| 40 |
+
img_tensor = transform(img).unsqueeze(0)
|
| 41 |
+
with torch.no_grad():
|
| 42 |
+
latent = motion_encoder(img_tensor)[0] # [1, 512]
|
| 43 |
+
latent_np = latent.numpy()
|
| 44 |
+
|
| 45 |
+
# 如果文件已存在,先加载原有数据
|
| 46 |
+
if os.path.exists(save_npz_path):
|
| 47 |
+
existing_data = np.load(save_npz_path, allow_pickle=True)
|
| 48 |
+
data_dict = dict(existing_data)
|
| 49 |
+
existing_data.close() # 关闭文件
|
| 50 |
+
else:
|
| 51 |
+
data_dict = {}
|
| 52 |
+
|
| 53 |
+
# 更新或添加新的键值对
|
| 54 |
+
data_dict.update({
|
| 55 |
+
'video_id': os.path.basename(save_npz_path)[:-4],
|
| 56 |
+
'mask_img_path': mask_image_path,
|
| 57 |
+
'ref_img_path': save_npz_path.replace('npz', 'png'),
|
| 58 |
+
'motion_latent': latent_np
|
| 59 |
+
})
|
| 60 |
+
|
| 61 |
+
# 保存更新后的数据
|
| 62 |
+
np.savez(save_npz_path, **data_dict)
|
| 63 |
+
# np.savez(
|
| 64 |
+
# save_npz_path,
|
| 65 |
+
# video_id=os.path.basename(save_npz_path)[:-4],
|
| 66 |
+
# mask_img_path=mask_image_path,
|
| 67 |
+
# ref_img_path=save_npz_path.replace('npz', 'png'),
|
| 68 |
+
# motion_latent=latent_np
|
| 69 |
+
# )
|
| 70 |
+
if __name__ == '__main__':
|
| 71 |
+
fire.Fire(extract_motion_latent)
|
tools/visualization_0416/img_to_mask.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
input: image_path
|
| 3 |
+
output: save a masked image and resized image
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import urllib.request
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
import cv2
|
| 11 |
+
from PIL import Image
|
| 12 |
+
from omegaconf import OmegaConf
|
| 13 |
+
from torchvision import transforms
|
| 14 |
+
from utils.face_detector import FaceDetector
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
def generate_crop_bounding_box(h, w, center, size=512):
|
| 18 |
+
"""
|
| 19 |
+
Crop a region of a specified size from the given center point,
|
| 20 |
+
filling the area outside the image boundary with zeros.
|
| 21 |
+
|
| 22 |
+
:param image: The input image in NumPy array form, shape (H, W, C)
|
| 23 |
+
:param center: The center point (y, x) to start cropping from
|
| 24 |
+
:param size: The size of the cropped region (default is 512)
|
| 25 |
+
:return: The cropped region with padding, shape (size, size, C)
|
| 26 |
+
"""
|
| 27 |
+
half_size = size // 2 # Half the size for the cropping region
|
| 28 |
+
|
| 29 |
+
# Calculate the top-left and bottom-right coordinates of the cropping region
|
| 30 |
+
y1 = max(center[0] - half_size, 0) # Ensure the y1 index is not less than 0
|
| 31 |
+
x1 = max(center[1] - half_size, 0) # Ensure the x1 index is not less than 0
|
| 32 |
+
y2 = min(center[0] + half_size, h) # Ensure the y2 index does not exceed the image height
|
| 33 |
+
x2 = min(center[1] + half_size, w) # Ensure the x2 index does not exceed the image width
|
| 34 |
+
return [x1, y1, x2, y2]
|
| 35 |
+
|
| 36 |
+
def crop_from_bbox(image, center, bbox, size=512):
|
| 37 |
+
"""
|
| 38 |
+
Crop a region of a specified size from the given center point,
|
| 39 |
+
filling the area outside the image boundary with zeros.
|
| 40 |
+
|
| 41 |
+
:param image: The input image in NumPy array form, shape (H, W, C)
|
| 42 |
+
:param center: The center point (y, x) to start cropping from
|
| 43 |
+
:param size: The size of the cropped region (default is 512)
|
| 44 |
+
:return: The cropped region with padding, shape (size, size, C)
|
| 45 |
+
"""
|
| 46 |
+
h, w = image.shape[:2] # Get the height and width of the image
|
| 47 |
+
x1, y1, x2, y2 = bbox
|
| 48 |
+
half_size = size // 2 # Half the size for the cropping region
|
| 49 |
+
# Create a zero-filled array for padding
|
| 50 |
+
cropped = np.zeros((size, size, image.shape[2]), dtype=image.dtype)
|
| 51 |
+
|
| 52 |
+
# Copy the valid region from the original image to the cropped region
|
| 53 |
+
cropped[(y1 - (center[0] - half_size)):(y2 - (center[0] - half_size)),
|
| 54 |
+
(x1 - (center[1] - half_size)):(x2 - (center[1] - half_size))] = image[y1:y2, x1:x2]
|
| 55 |
+
|
| 56 |
+
return cropped
|
| 57 |
+
|
| 58 |
+
face_detector = None
|
| 59 |
+
model_path = "./utils/face_landmarker.task"
|
| 60 |
+
if not os.path.exists(model_path):
|
| 61 |
+
print("Downloading face landmarker model...")
|
| 62 |
+
url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
|
| 63 |
+
urllib.request.urlretrieve(url, model_path)
|
| 64 |
+
|
| 65 |
+
def initialize_face_detector():
|
| 66 |
+
global face_detector
|
| 67 |
+
if face_detector is None:
|
| 68 |
+
face_detector = FaceDetector(
|
| 69 |
+
mediapipe_model_asset_path=model_path,
|
| 70 |
+
face_detection_confidence=0.5,
|
| 71 |
+
num_faces=1,
|
| 72 |
+
)
|
| 73 |
+
initialize_face_detector()
|
| 74 |
+
|
| 75 |
+
def augmentation(images, transform, state=None):
|
| 76 |
+
if state is not None:
|
| 77 |
+
torch.set_rng_state(state)
|
| 78 |
+
if isinstance(images, list):
|
| 79 |
+
transformed = [transforms.functional.to_tensor(img) for img in images]
|
| 80 |
+
return transform(torch.stack(transformed, dim=0))
|
| 81 |
+
return transform(transforms.functional.to_tensor(images))
|
| 82 |
+
|
| 83 |
+
def scale_bbox(bbox, h, w, scale=1.8):
|
| 84 |
+
sw = (bbox[2] - bbox[0]) / 2
|
| 85 |
+
sh = (bbox[3] - bbox[1]) / 2
|
| 86 |
+
cx = (bbox[0] + bbox[2]) / 2
|
| 87 |
+
cy = (bbox[1] + bbox[3]) / 2
|
| 88 |
+
sw *= scale
|
| 89 |
+
sh *= scale
|
| 90 |
+
scaled = [cx - sw, cy - sh, cx + sw, cy + sh]
|
| 91 |
+
scaled[0] = np.clip(scaled[0], 0, w)
|
| 92 |
+
scaled[2] = np.clip(scaled[2], 0, w)
|
| 93 |
+
scaled[1] = np.clip(scaled[1], 0, h)
|
| 94 |
+
scaled[3] = np.clip(scaled[3], 0, h)
|
| 95 |
+
return scaled
|
| 96 |
+
|
| 97 |
+
def get_mask(bbox, hd, wd, scale=1.0, return_pil=True):
|
| 98 |
+
if min(bbox) < 0:
|
| 99 |
+
raise Exception("Invalid mask")
|
| 100 |
+
bbox = scale_bbox(bbox, hd, wd, scale=scale)
|
| 101 |
+
x0, y0, x1, y1 = [int(v) for v in bbox]
|
| 102 |
+
mask = np.zeros((hd, wd, 3), dtype=np.uint8)
|
| 103 |
+
mask[y0:y1, x0:x1, :] = 255
|
| 104 |
+
if return_pil:
|
| 105 |
+
return Image.fromarray(mask)
|
| 106 |
+
return mask
|
| 107 |
+
|
| 108 |
+
def generate_masked_image(
|
| 109 |
+
image_path="./test_case/test_img.png",
|
| 110 |
+
save_path="./test_case/test_img.png",
|
| 111 |
+
crop=False,
|
| 112 |
+
union_bbox_scale=1.3):
|
| 113 |
+
cfg = OmegaConf.load("./configs/audio_head_animator.yaml")
|
| 114 |
+
pixel_transform = transforms.Compose([
|
| 115 |
+
transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC),
|
| 116 |
+
transforms.Normalize([0.5], [0.5]),
|
| 117 |
+
])
|
| 118 |
+
resize_transform = transforms.Resize((512, 512), interpolation=transforms.InterpolationMode.BICUBIC)
|
| 119 |
+
|
| 120 |
+
img = Image.open(image_path).convert("RGB")
|
| 121 |
+
state = torch.get_rng_state()
|
| 122 |
+
|
| 123 |
+
# Get face detection results first
|
| 124 |
+
det_res = face_detector.get_face_xy_rotation_and_keypoints(
|
| 125 |
+
np.array(img), cfg.data.mouth_bbox_scale, cfg.data.eye_bbox_scale
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
person_id = 0
|
| 129 |
+
mouth_bbox = np.array(det_res[6][person_id])
|
| 130 |
+
eye_bbox = det_res[7][person_id]
|
| 131 |
+
face_contour = np.array(det_res[8][person_id])
|
| 132 |
+
left_eye_bbox = eye_bbox["left_eye"]
|
| 133 |
+
right_eye_bbox = eye_bbox["right_eye"]
|
| 134 |
+
|
| 135 |
+
# If crop is True, crop the face region first
|
| 136 |
+
if crop:
|
| 137 |
+
# Get the face bounding box and calculate center
|
| 138 |
+
face_bbox = det_res[5][person_id] # Get the face bounding box from det_res[5]
|
| 139 |
+
# face_bbox is [(x1, y1), (x2, y2)]
|
| 140 |
+
x1, y1 = face_bbox[0]
|
| 141 |
+
x2, y2 = face_bbox[1]
|
| 142 |
+
center = [(y1 + y2) // 2, (x1 + x2) // 2]
|
| 143 |
+
|
| 144 |
+
# Calculate the size for cropping
|
| 145 |
+
width = x2 - x1
|
| 146 |
+
height = y2 - y1
|
| 147 |
+
max_size = int(max(width, height) * union_bbox_scale)
|
| 148 |
+
|
| 149 |
+
# Get the image dimensions
|
| 150 |
+
hd, wd = img.size[1], img.size[0]
|
| 151 |
+
|
| 152 |
+
# Generate the crop bounding box
|
| 153 |
+
crop_bbox = generate_crop_bounding_box(hd, wd, center, max_size)
|
| 154 |
+
|
| 155 |
+
# Crop the image
|
| 156 |
+
img_array = np.array(img)
|
| 157 |
+
cropped_img = crop_from_bbox(img_array, center, crop_bbox, size=max_size)
|
| 158 |
+
img = Image.fromarray(cropped_img)
|
| 159 |
+
|
| 160 |
+
# Update the face detection results for the cropped image
|
| 161 |
+
det_res = face_detector.get_face_xy_rotation_and_keypoints(
|
| 162 |
+
cropped_img, cfg.data.mouth_bbox_scale, cfg.data.eye_bbox_scale
|
| 163 |
+
)
|
| 164 |
+
mouth_bbox = np.array(det_res[6][person_id])
|
| 165 |
+
eye_bbox = det_res[7][person_id]
|
| 166 |
+
face_contour = np.array(det_res[8][person_id])
|
| 167 |
+
left_eye_bbox = eye_bbox["left_eye"]
|
| 168 |
+
right_eye_bbox = eye_bbox["right_eye"]
|
| 169 |
+
|
| 170 |
+
pixel_values_ref = augmentation([img], pixel_transform, state)
|
| 171 |
+
pixel_values_ref = (pixel_values_ref + 1) / 2
|
| 172 |
+
new_hd, new_wd = img.size[1], img.size[0]
|
| 173 |
+
|
| 174 |
+
mouth_mask = resize_transform(get_mask(mouth_bbox, new_hd, new_wd, scale=1.0))
|
| 175 |
+
left_eye_mask = resize_transform(get_mask(left_eye_bbox, new_hd, new_wd, scale=1.0))
|
| 176 |
+
right_eye_mask = resize_transform(get_mask(right_eye_bbox, new_hd, new_wd, scale=1.0))
|
| 177 |
+
face_contour = resize_transform(Image.fromarray(face_contour))
|
| 178 |
+
|
| 179 |
+
eye_mask = np.bitwise_or(np.array(left_eye_mask), np.array(right_eye_mask))
|
| 180 |
+
combined_mask = np.bitwise_or(eye_mask, np.array(mouth_mask))
|
| 181 |
+
|
| 182 |
+
combined_mask_tensor = torch.from_numpy(combined_mask / 255.0).permute(2, 0, 1).unsqueeze(0)
|
| 183 |
+
face_contour_tensor = torch.from_numpy(np.array(face_contour) / 255.0).permute(2, 0, 1).unsqueeze(0)
|
| 184 |
+
|
| 185 |
+
masked_ref = pixel_values_ref * combined_mask_tensor + face_contour_tensor * (1 - combined_mask_tensor)
|
| 186 |
+
masked_ref = masked_ref.clamp(0, 1)
|
| 187 |
+
masked_ref_np = (masked_ref.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
|
| 188 |
+
|
| 189 |
+
base, _ = os.path.splitext(save_path)
|
| 190 |
+
resized_img = (pixel_values_ref.squeeze(0).permute(1, 2, 0).cpu().numpy().clip(0, 1) * 255).astype(np.uint8)
|
| 191 |
+
Image.fromarray(resized_img).save(f"{base}_resize.png")
|
| 192 |
+
Image.fromarray(masked_ref_np).save(f"{base}_masked.png")
|
| 193 |
+
|
| 194 |
+
if __name__ == '__main__':
|
| 195 |
+
import fire
|
| 196 |
+
fire.Fire(generate_masked_image)
|
| 197 |
+
# python img_to_mask.py --image_path /mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/KristiNoem2-Scene-001.png --save_path /mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/KristiNoem2-Scene-001.png --crop True --union_bbox_scale 1.6
|
| 198 |
+
# python img_to_latent.py --mask_image_path ./test_case/ChrisVanHollen0-Scene-003_masked.png --save_npz_path ./test_case/ChrisVanHollen0-Scene-003_resize.npz
|
| 199 |
+
# python latent_two_video.py --npz_path ./test_case/ChrisVanHollen0-Scene-003_resize.npz --save_dir ./test_case/
|
tools/visualization_0416/latent_to_video.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# 获取项目根目录并添加到 sys.path 最前面,确保导入正确的 utils 模块
|
| 5 |
+
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 6 |
+
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
|
| 7 |
+
if _PROJECT_ROOT not in sys.path:
|
| 8 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import torchvision.transforms as T
|
| 14 |
+
from omegaconf import OmegaConf
|
| 15 |
+
import fire
|
| 16 |
+
import imageio
|
| 17 |
+
import moviepy.editor as mp
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
|
| 20 |
+
def init_fn(config_path, version):
|
| 21 |
+
sys.path.insert(0, f'./utils/model_{version}')
|
| 22 |
+
from utils import instantiate
|
| 23 |
+
config = OmegaConf.load(config_path)
|
| 24 |
+
module = instantiate(config.model, instantiate_module=False)
|
| 25 |
+
model = module(config=config)
|
| 26 |
+
checkpoint = torch.load(config.resume_ckpt, map_location="cpu")
|
| 27 |
+
model.load_state_dict(checkpoint["state_dict"], strict=False)
|
| 28 |
+
model.eval().to("cuda")
|
| 29 |
+
transform = T.Compose([
|
| 30 |
+
T.Resize((512, 512)),
|
| 31 |
+
T.ToTensor(),
|
| 32 |
+
T.Normalize([0.5], [0.5]),
|
| 33 |
+
])
|
| 34 |
+
return {
|
| 35 |
+
"transform": transform,
|
| 36 |
+
"flow_estimator": model.flow_estimator,
|
| 37 |
+
"face_generator": model.face_generator,
|
| 38 |
+
"face_encoder": model.face_encoder,
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
def latent_to_video(
|
| 42 |
+
npz_dir="./test_case/",
|
| 43 |
+
save_dir="./test_case/",
|
| 44 |
+
save_fps: int = 25,
|
| 45 |
+
config_path: str = './configs/head_animator_best_0416.yaml',
|
| 46 |
+
version: str = '0416',
|
| 47 |
+
):
|
| 48 |
+
# 处理相对路径:
|
| 49 |
+
# - npz_dir 和 save_dir:如果是相对路径,转换为基于项目根目录的绝对路径
|
| 50 |
+
# - config_path:如果是相对路径,转换为基于当前脚本目录(tools/visualization_0416/)的绝对路径
|
| 51 |
+
if not os.path.isabs(npz_dir):
|
| 52 |
+
npz_dir = os.path.join(_PROJECT_ROOT, npz_dir)
|
| 53 |
+
if not os.path.isabs(save_dir):
|
| 54 |
+
save_dir = os.path.join(_PROJECT_ROOT, save_dir)
|
| 55 |
+
if not os.path.isabs(config_path):
|
| 56 |
+
config_path = os.path.join(_SCRIPT_DIR, config_path)
|
| 57 |
+
|
| 58 |
+
# 规范化路径(去除多余的 . 和 ..)
|
| 59 |
+
npz_dir = os.path.normpath(npz_dir)
|
| 60 |
+
save_dir = os.path.normpath(save_dir)
|
| 61 |
+
config_path = os.path.normpath(config_path)
|
| 62 |
+
|
| 63 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 64 |
+
# 只在文件名上做版本号替换,避免把路径里的 "0416" 一并替换成 "0506"
|
| 65 |
+
config_dir = os.path.dirname(config_path)
|
| 66 |
+
config_name = os.path.basename(config_path)
|
| 67 |
+
config_name = config_name.replace("0416", version)
|
| 68 |
+
config_path = os.path.join(config_dir, config_name)
|
| 69 |
+
|
| 70 |
+
# Initialize models only once
|
| 71 |
+
print("Initializing models...")
|
| 72 |
+
print(f"NPZ directory: {npz_dir}")
|
| 73 |
+
print(f"Save directory: {save_dir}")
|
| 74 |
+
ctx = init_fn(config_path, version)
|
| 75 |
+
transform = ctx["transform"]
|
| 76 |
+
flow_estimator = ctx["flow_estimator"]
|
| 77 |
+
face_generator = ctx["face_generator"]
|
| 78 |
+
face_encoder = ctx["face_encoder"]
|
| 79 |
+
|
| 80 |
+
# Get all npz files
|
| 81 |
+
if not os.path.exists(npz_dir):
|
| 82 |
+
print(f"Error: NPZ directory does not exist: {npz_dir}")
|
| 83 |
+
return
|
| 84 |
+
|
| 85 |
+
npz_files = [f for f in os.listdir(npz_dir) if f.endswith('_output.npz')]
|
| 86 |
+
print(f"Found {len(npz_files)} files to process")
|
| 87 |
+
|
| 88 |
+
# Process each file
|
| 89 |
+
for npz_file in tqdm(npz_files, desc="Processing files"):
|
| 90 |
+
if not npz_file.endswith('.npz'): continue
|
| 91 |
+
try:
|
| 92 |
+
npz_path = os.path.join(npz_dir, npz_file)
|
| 93 |
+
data = np.load(npz_path, allow_pickle=True)
|
| 94 |
+
motion_latent = torch.from_numpy(data["motion_latent"]).to("cuda").float()
|
| 95 |
+
if len(motion_latent.shape) == 3:
|
| 96 |
+
motion_latent = motion_latent.squeeze(0)
|
| 97 |
+
num_frames = motion_latent.shape[0]
|
| 98 |
+
print(f"\nProcessing {npz_file} with {num_frames} frames")
|
| 99 |
+
|
| 100 |
+
# 处理 ref_img_path - 如果是相对路径,基于项目根目录解析
|
| 101 |
+
ref_img_path = str(data["ref_img_path"])
|
| 102 |
+
if not os.path.isabs(ref_img_path):
|
| 103 |
+
ref_img_path = os.path.join(_PROJECT_ROOT, ref_img_path)
|
| 104 |
+
ref_img = Image.open(ref_img_path).convert("RGB")
|
| 105 |
+
ref_img = transform(ref_img).unsqueeze(0).to("cuda")
|
| 106 |
+
# np.save("/mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/face_encoder_input.npy", ref_img.cpu().numpy())
|
| 107 |
+
|
| 108 |
+
with torch.no_grad():
|
| 109 |
+
face_feat = face_encoder(ref_img)
|
| 110 |
+
# np.save("/mnt/weka/haiyang_workspace/ckpts/good_train_case/image_example/face_encoder_output.npy", face_feat.cpu().numpy())
|
| 111 |
+
recon_list = []
|
| 112 |
+
for i in range(0, num_frames):
|
| 113 |
+
tgt = flow_estimator(motion_latent[0:1], motion_latent[i:i+1])
|
| 114 |
+
recon_list.append(face_generator(tgt, face_feat))
|
| 115 |
+
|
| 116 |
+
recon = torch.cat(recon_list, dim=0)
|
| 117 |
+
video_np = recon.permute(0, 2, 3, 1).cpu().numpy()
|
| 118 |
+
video_np = np.clip((video_np + 1) / 2 * 255, 0, 255).astype("uint8")
|
| 119 |
+
|
| 120 |
+
video_id = str(data["video_id"])
|
| 121 |
+
# Remove leading dash to prevent FFMPEG command line parsing issues
|
| 122 |
+
if video_id.startswith('-'):
|
| 123 |
+
video_id = video_id[1:]
|
| 124 |
+
|
| 125 |
+
if num_frames == 1:
|
| 126 |
+
out_path = os.path.join(save_dir, f"{video_id}_rec.png")
|
| 127 |
+
Image.fromarray(video_np[0]).save(out_path)
|
| 128 |
+
else:
|
| 129 |
+
temp_mp4 = os.path.join(save_dir, f"{video_id}_temp.mp4")
|
| 130 |
+
final_mp4 = os.path.join(save_dir, f"{video_id}.mp4")
|
| 131 |
+
finalfinal_mp4 = os.path.join(save_dir, f"{str(data['video_id'])}.mp4")
|
| 132 |
+
with imageio.get_writer(temp_mp4, fps=save_fps) as writer:
|
| 133 |
+
for frame in video_np:
|
| 134 |
+
writer.append_data(frame)
|
| 135 |
+
# 处理 audio_path - 如果是相对路径,基于项目根目录解析
|
| 136 |
+
audio_path = str(data["audio_path"]) if "audio_path" in data.files else None
|
| 137 |
+
if audio_path and not os.path.isabs(audio_path):
|
| 138 |
+
audio_path = os.path.join(_PROJECT_ROOT, audio_path)
|
| 139 |
+
if audio_path and os.path.exists(audio_path):
|
| 140 |
+
clip = mp.VideoFileClip(temp_mp4)
|
| 141 |
+
audio = mp.AudioFileClip(audio_path)
|
| 142 |
+
clip.set_audio(audio).write_videofile(final_mp4, codec="libx264", audio_codec="aac")
|
| 143 |
+
clip.close()
|
| 144 |
+
audio.close()
|
| 145 |
+
os.remove(temp_mp4)
|
| 146 |
+
else:
|
| 147 |
+
os.rename(temp_mp4, final_mp4)
|
| 148 |
+
os.rename(final_mp4, finalfinal_mp4)
|
| 149 |
+
except Exception as e:
|
| 150 |
+
print(f"Error processing {npz_file}: {str(e)}")
|
| 151 |
+
continue
|
| 152 |
+
|
| 153 |
+
if __name__ == "__main__":
|
| 154 |
+
fire.Fire(latent_to_video)
|
| 155 |
+
# Example usage:
|
| 156 |
+
# python latent_to_video.py --npz_dir ./test_case/ --save_dir ./test_case/ --config_path ./configs/head_animator_best_0409.yaml --version 0416
|
tools/visualization_0416/latent_to_video_batch.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
批处理优化版本的 latent_to_video
|
| 3 |
+
相比原版逐帧处理,使用批处理加速约 10-30 倍
|
| 4 |
+
v2: 优化 GPU→CPU 传输和视频编码,使用流式处理
|
| 5 |
+
"""
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# 获取项目根目录并添加到 sys.path 最前面,确保导入正确的 utils 模块
|
| 10 |
+
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 11 |
+
_PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', '..'))
|
| 12 |
+
if _PROJECT_ROOT not in sys.path:
|
| 13 |
+
sys.path.insert(0, _PROJECT_ROOT)
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
from PIL import Image
|
| 18 |
+
import torchvision.transforms as T
|
| 19 |
+
from omegaconf import OmegaConf
|
| 20 |
+
import fire
|
| 21 |
+
import imageio
|
| 22 |
+
import moviepy.editor as mp
|
| 23 |
+
from tqdm import tqdm
|
| 24 |
+
import time
|
| 25 |
+
import subprocess
|
| 26 |
+
import tempfile
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def init_fn(config_path, version):
|
| 30 |
+
sys.path.insert(0, f'./utils/model_{version}')
|
| 31 |
+
from utils import instantiate
|
| 32 |
+
config = OmegaConf.load(config_path)
|
| 33 |
+
module = instantiate(config.model, instantiate_module=False)
|
| 34 |
+
model = module(config=config)
|
| 35 |
+
checkpoint = torch.load(config.resume_ckpt, map_location="cpu")
|
| 36 |
+
model.load_state_dict(checkpoint["state_dict"], strict=False)
|
| 37 |
+
model.eval().to("cuda")
|
| 38 |
+
transform = T.Compose([
|
| 39 |
+
T.Resize((512, 512)),
|
| 40 |
+
T.ToTensor(),
|
| 41 |
+
T.Normalize([0.5], [0.5]),
|
| 42 |
+
])
|
| 43 |
+
return {
|
| 44 |
+
"transform": transform,
|
| 45 |
+
"flow_estimator": model.flow_estimator,
|
| 46 |
+
"face_generator": model.face_generator,
|
| 47 |
+
"face_encoder": model.face_encoder,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def latent_to_video_batch(
|
| 52 |
+
npz_dir="./test_case/",
|
| 53 |
+
save_dir="./test_case/",
|
| 54 |
+
save_fps: int = 25,
|
| 55 |
+
config_path: str = './configs/head_animator_best_0416.yaml',
|
| 56 |
+
version: str = '0416',
|
| 57 |
+
batch_size: int = 32,
|
| 58 |
+
use_fp16: bool = True,
|
| 59 |
+
):
|
| 60 |
+
"""
|
| 61 |
+
批处理优化版本的 latent_to_video
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
npz_dir: NPZ 文件目录
|
| 65 |
+
save_dir: 输出视频目录
|
| 66 |
+
save_fps: 输出视频帧率
|
| 67 |
+
config_path: 模型配置文件路径
|
| 68 |
+
version: 模型版本
|
| 69 |
+
batch_size: 批处理大小,根据显存调整 (默认 32,显存不足可降到 16 或 8)
|
| 70 |
+
use_fp16: 是否使用混合精度加速 (默认 True)
|
| 71 |
+
"""
|
| 72 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 73 |
+
config_path = config_path.replace("0416", version)
|
| 74 |
+
|
| 75 |
+
# Initialize models only once
|
| 76 |
+
print("Initializing models...")
|
| 77 |
+
ctx = init_fn(config_path, version)
|
| 78 |
+
transform = ctx["transform"]
|
| 79 |
+
flow_estimator = ctx["flow_estimator"]
|
| 80 |
+
face_generator = ctx["face_generator"]
|
| 81 |
+
face_encoder = ctx["face_encoder"]
|
| 82 |
+
|
| 83 |
+
# Get all npz files
|
| 84 |
+
npz_files = [f for f in os.listdir(npz_dir) if f.endswith('_output.npz')]
|
| 85 |
+
print(f"Found {len(npz_files)} files to process")
|
| 86 |
+
print(f"Batch size: {batch_size}, FP16: {use_fp16}")
|
| 87 |
+
|
| 88 |
+
total_frames = 0
|
| 89 |
+
total_time = 0
|
| 90 |
+
|
| 91 |
+
# Process each file
|
| 92 |
+
for npz_file in tqdm(npz_files, desc="Processing files"):
|
| 93 |
+
if not npz_file.endswith('.npz'):
|
| 94 |
+
continue
|
| 95 |
+
try:
|
| 96 |
+
npz_path = os.path.join(npz_dir, npz_file)
|
| 97 |
+
data = np.load(npz_path, allow_pickle=True)
|
| 98 |
+
motion_latent = torch.from_numpy(data["motion_latent"]).to("cuda").float()
|
| 99 |
+
if len(motion_latent.shape) == 3:
|
| 100 |
+
motion_latent = motion_latent.squeeze(0)
|
| 101 |
+
num_frames = motion_latent.shape[0]
|
| 102 |
+
print(f"\nProcessing {npz_file} with {num_frames} frames")
|
| 103 |
+
|
| 104 |
+
# 处理 ref_img_path - 如果是相对路径,基于项目根目录解析
|
| 105 |
+
ref_img_path = str(data["ref_img_path"])
|
| 106 |
+
if not os.path.isabs(ref_img_path):
|
| 107 |
+
ref_img_path = os.path.join(_PROJECT_ROOT, ref_img_path)
|
| 108 |
+
ref_img = Image.open(ref_img_path).convert("RGB")
|
| 109 |
+
ref_img = transform(ref_img).unsqueeze(0).to("cuda")
|
| 110 |
+
|
| 111 |
+
video_id = str(data["video_id"])
|
| 112 |
+
# Remove leading dash to prevent FFMPEG command line parsing issues
|
| 113 |
+
if video_id.startswith('-'):
|
| 114 |
+
video_id = video_id[1:]
|
| 115 |
+
|
| 116 |
+
# 处理 audio_path
|
| 117 |
+
audio_path = str(data["audio_path"]) if "audio_path" in data.files else None
|
| 118 |
+
if audio_path and not os.path.isabs(audio_path):
|
| 119 |
+
audio_path = os.path.join(_PROJECT_ROOT, audio_path)
|
| 120 |
+
|
| 121 |
+
start_time = time.time()
|
| 122 |
+
|
| 123 |
+
# 准备输出路径
|
| 124 |
+
temp_mp4 = os.path.join(save_dir, f"{video_id}_temp.mp4")
|
| 125 |
+
final_mp4 = os.path.join(save_dir, f"{video_id}.mp4")
|
| 126 |
+
finalfinal_mp4 = os.path.join(save_dir, f"{str(data['video_id'])}.mp4")
|
| 127 |
+
|
| 128 |
+
if num_frames == 1:
|
| 129 |
+
# 单帧情况
|
| 130 |
+
with torch.no_grad():
|
| 131 |
+
with torch.cuda.amp.autocast(enabled=use_fp16):
|
| 132 |
+
face_feat = face_encoder(ref_img)
|
| 133 |
+
tgt = flow_estimator(motion_latent[0:1], motion_latent[0:1])
|
| 134 |
+
recon = face_generator(tgt, face_feat)
|
| 135 |
+
if use_fp16:
|
| 136 |
+
recon = recon.float()
|
| 137 |
+
|
| 138 |
+
video_np = recon.permute(0, 2, 3, 1).cpu().numpy()
|
| 139 |
+
video_np = np.clip((video_np + 1) / 2 * 255, 0, 255).astype("uint8")
|
| 140 |
+
out_path = os.path.join(save_dir, f"{video_id}_rec.png")
|
| 141 |
+
Image.fromarray(video_np[0]).save(out_path)
|
| 142 |
+
else:
|
| 143 |
+
# 多帧情况 - 使用 FFmpeg pipe 流式编码
|
| 144 |
+
# 启动 FFmpeg 进程
|
| 145 |
+
ffmpeg_cmd = [
|
| 146 |
+
'ffmpeg', '-y',
|
| 147 |
+
'-f', 'rawvideo',
|
| 148 |
+
'-vcodec', 'rawvideo',
|
| 149 |
+
'-s', '512x512',
|
| 150 |
+
'-pix_fmt', 'rgb24',
|
| 151 |
+
'-r', str(save_fps),
|
| 152 |
+
'-i', '-',
|
| 153 |
+
'-c:v', 'libx264',
|
| 154 |
+
'-preset', 'fast',
|
| 155 |
+
'-crf', '18',
|
| 156 |
+
'-pix_fmt', 'yuv420p',
|
| 157 |
+
temp_mp4
|
| 158 |
+
]
|
| 159 |
+
|
| 160 |
+
ffmpeg_process = subprocess.Popen(
|
| 161 |
+
ffmpeg_cmd,
|
| 162 |
+
stdin=subprocess.PIPE,
|
| 163 |
+
stdout=subprocess.DEVNULL,
|
| 164 |
+
stderr=subprocess.DEVNULL
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
with torch.no_grad():
|
| 168 |
+
with torch.cuda.amp.autocast(enabled=use_fp16):
|
| 169 |
+
face_feat = face_encoder(ref_img) # (1, 32, 16, 64, 64)
|
| 170 |
+
ref_latent = motion_latent[0:1] # 参考帧的 latent
|
| 171 |
+
|
| 172 |
+
# 批处理推理 + 流式写入
|
| 173 |
+
for i in range(0, num_frames, batch_size):
|
| 174 |
+
batch_end = min(i + batch_size, num_frames)
|
| 175 |
+
current_batch_size = batch_end - i
|
| 176 |
+
|
| 177 |
+
# 获取当前批次的 motion latent
|
| 178 |
+
batch_motion = motion_latent[i:batch_end]
|
| 179 |
+
|
| 180 |
+
# 扩展参考帧 latent 到批次大小
|
| 181 |
+
ref_latent_expanded = ref_latent.expand(current_batch_size, -1)
|
| 182 |
+
|
| 183 |
+
# 扩展 face_feat 到批次大小
|
| 184 |
+
face_feat_expanded = face_feat.expand(current_batch_size, -1, -1, -1, -1)
|
| 185 |
+
|
| 186 |
+
# 批量计算 flow
|
| 187 |
+
tgt = flow_estimator(ref_latent_expanded, batch_motion)
|
| 188 |
+
|
| 189 |
+
# 批量生成图像
|
| 190 |
+
recon = face_generator(tgt, face_feat_expanded)
|
| 191 |
+
|
| 192 |
+
# 转换并写入 - 直接在 GPU 上做归一化
|
| 193 |
+
# (batch, 3, 512, 512) -> (batch, 512, 512, 3)
|
| 194 |
+
recon = recon.float()
|
| 195 |
+
recon = (recon + 1) / 2 * 255
|
| 196 |
+
recon = recon.clamp(0, 255).to(torch.uint8)
|
| 197 |
+
recon = recon.permute(0, 2, 3, 1).contiguous()
|
| 198 |
+
|
| 199 |
+
# 分块传输到 CPU 并写入
|
| 200 |
+
frames_np = recon.cpu().numpy()
|
| 201 |
+
ffmpeg_process.stdin.write(frames_np.tobytes())
|
| 202 |
+
|
| 203 |
+
# 关闭 FFmpeg
|
| 204 |
+
ffmpeg_process.stdin.close()
|
| 205 |
+
ffmpeg_process.wait()
|
| 206 |
+
|
| 207 |
+
elapsed = time.time() - start_time
|
| 208 |
+
total_frames += num_frames
|
| 209 |
+
total_time += elapsed
|
| 210 |
+
fps = num_frames / elapsed
|
| 211 |
+
print(f" Rendered + encoded {num_frames} frames in {elapsed:.2f}s ({fps:.1f} fps)")
|
| 212 |
+
|
| 213 |
+
# 合并音频
|
| 214 |
+
if audio_path and os.path.exists(audio_path):
|
| 215 |
+
# 使用 FFmpeg 直接合并音频(比 moviepy 快很多)
|
| 216 |
+
final_with_audio = os.path.join(save_dir, f"{video_id}_with_audio.mp4")
|
| 217 |
+
ffmpeg_audio_cmd = [
|
| 218 |
+
'ffmpeg', '-y',
|
| 219 |
+
'-i', temp_mp4,
|
| 220 |
+
'-i', audio_path,
|
| 221 |
+
'-c:v', 'copy',
|
| 222 |
+
'-c:a', 'aac',
|
| 223 |
+
'-shortest',
|
| 224 |
+
final_with_audio
|
| 225 |
+
]
|
| 226 |
+
subprocess.run(ffmpeg_audio_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
| 227 |
+
os.remove(temp_mp4)
|
| 228 |
+
os.rename(final_with_audio, finalfinal_mp4)
|
| 229 |
+
else:
|
| 230 |
+
os.rename(temp_mp4, finalfinal_mp4)
|
| 231 |
+
|
| 232 |
+
except Exception as e:
|
| 233 |
+
import traceback
|
| 234 |
+
print(f"Error processing {npz_file}: {str(e)}")
|
| 235 |
+
traceback.print_exc()
|
| 236 |
+
continue
|
| 237 |
+
|
| 238 |
+
# 打印总体统计
|
| 239 |
+
if total_time > 0:
|
| 240 |
+
print(f"\n{'='*50}")
|
| 241 |
+
print(f"总计: {total_frames} 帧, {total_time:.2f} ��")
|
| 242 |
+
print(f"平均渲染速度: {total_frames / total_time:.1f} fps")
|
| 243 |
+
print(f"{'='*50}")
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
if __name__ == "__main__":
|
| 247 |
+
fire.Fire(latent_to_video_batch)
|
| 248 |
+
# Example usage:
|
| 249 |
+
# python latent_to_video_batch.py --npz_dir ./test_case/ --save_dir ./test_case/ --batch_size 32 --use_fp16 True
|
tools/visualization_0416/utils/__init__.py
ADDED
|
File without changes
|
tools/visualization_0416/utils/face_detector.py
ADDED
|
@@ -0,0 +1,624 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import mediapipe as mp
|
| 2 |
+
from mediapipe import solutions
|
| 3 |
+
from mediapipe.framework.formats import landmark_pb2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import cv2
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def convert_bbox_to_square_bbox(bbox, max_h, max_w, scale=1.0):
|
| 9 |
+
# Calculate width, height, and max_size of the bounding box
|
| 10 |
+
width = bbox[1][0] - bbox[0][0]
|
| 11 |
+
height = bbox[1][1] - bbox[0][1]
|
| 12 |
+
max_size = max(width, height) * scale
|
| 13 |
+
|
| 14 |
+
# Calculate center of the bounding box
|
| 15 |
+
center_x = (bbox[0][0] + bbox[1][0]) / 2
|
| 16 |
+
center_y = (bbox[0][1] + bbox[1][1]) / 2
|
| 17 |
+
|
| 18 |
+
# Calculate the left-up and right-bottom corners of the square bounding box
|
| 19 |
+
half_size = max_size / 2
|
| 20 |
+
left_top = [int(center_x - half_size), int(center_y - half_size)]
|
| 21 |
+
right_bottom = [int(center_x + half_size), int(center_y + half_size)]
|
| 22 |
+
|
| 23 |
+
# Ensure the square is within image bounds
|
| 24 |
+
left_top[0] = max(0, left_top[0])
|
| 25 |
+
left_top[1] = max(0, left_top[1])
|
| 26 |
+
right_bottom[0] = min(max_w, right_bottom[0])
|
| 27 |
+
right_bottom[1] = min(max_h, right_bottom[1])
|
| 28 |
+
|
| 29 |
+
# Return the new bounding box as a list of top-left and bottom-right coordinates
|
| 30 |
+
return [left_top[0], left_top[1], right_bottom[0], right_bottom[1]]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def draw_landmarks_on_image(rgb_image, detection_result):
|
| 34 |
+
face_landmarks_list = detection_result.face_landmarks
|
| 35 |
+
annotated_image = np.copy(rgb_image)
|
| 36 |
+
|
| 37 |
+
# Loop through the detected faces to visualize.
|
| 38 |
+
for idx in range(len(face_landmarks_list)):
|
| 39 |
+
face_landmarks = face_landmarks_list[idx]
|
| 40 |
+
|
| 41 |
+
# Draw the face landmarks.
|
| 42 |
+
face_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
|
| 43 |
+
face_landmarks_proto.landmark.extend(
|
| 44 |
+
[
|
| 45 |
+
landmark_pb2.NormalizedLandmark(
|
| 46 |
+
x=landmark.x, y=landmark.y, z=landmark.z
|
| 47 |
+
)
|
| 48 |
+
for landmark in face_landmarks
|
| 49 |
+
]
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
solutions.drawing_utils.draw_landmarks(
|
| 53 |
+
image=annotated_image,
|
| 54 |
+
landmark_list=face_landmarks_proto,
|
| 55 |
+
connections=mp.solutions.face_mesh.FACEMESH_TESSELATION,
|
| 56 |
+
landmark_drawing_spec=None,
|
| 57 |
+
connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_tesselation_style(),
|
| 58 |
+
)
|
| 59 |
+
solutions.drawing_utils.draw_landmarks(
|
| 60 |
+
image=annotated_image,
|
| 61 |
+
landmark_list=face_landmarks_proto,
|
| 62 |
+
connections=mp.solutions.face_mesh.FACEMESH_CONTOURS,
|
| 63 |
+
landmark_drawing_spec=None,
|
| 64 |
+
connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_contours_style(),
|
| 65 |
+
)
|
| 66 |
+
solutions.drawing_utils.draw_landmarks(
|
| 67 |
+
image=annotated_image,
|
| 68 |
+
landmark_list=face_landmarks_proto,
|
| 69 |
+
connections=mp.solutions.face_mesh.FACEMESH_IRISES,
|
| 70 |
+
landmark_drawing_spec=None,
|
| 71 |
+
connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_iris_connections_style(),
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
return annotated_image
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class FaceDetector:
|
| 78 |
+
def __init__(self, mediapipe_model_asset_path, delegate=1, face_detection_confidence=0.5, num_faces=5):
|
| 79 |
+
# Create a face landmarker instance with the video mode:
|
| 80 |
+
options = mp.tasks.vision.FaceLandmarkerOptions(
|
| 81 |
+
base_options=mp.tasks.BaseOptions(
|
| 82 |
+
model_asset_path=mediapipe_model_asset_path,
|
| 83 |
+
# delegate=mp.tasks.BaseOptions.Delegate.GPU,
|
| 84 |
+
# TODO: why does the gpu version not work in docker???
|
| 85 |
+
delegate=delegate,
|
| 86 |
+
),
|
| 87 |
+
running_mode=mp.tasks.vision.RunningMode.IMAGE,
|
| 88 |
+
num_faces=num_faces,
|
| 89 |
+
output_face_blendshapes=True,
|
| 90 |
+
output_facial_transformation_matrixes=True,
|
| 91 |
+
min_face_detection_confidence=face_detection_confidence,
|
| 92 |
+
min_face_presence_confidence=face_detection_confidence,
|
| 93 |
+
min_tracking_confidence=face_detection_confidence,
|
| 94 |
+
)
|
| 95 |
+
self.detector = mp.tasks.vision.FaceLandmarker.create_from_options(options)
|
| 96 |
+
|
| 97 |
+
def get_one_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
|
| 98 |
+
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
|
| 99 |
+
|
| 100 |
+
# get facial rotation
|
| 101 |
+
results = self.detector.detect(mp_image)
|
| 102 |
+
max_h, max_w = image.shape[:2]
|
| 103 |
+
|
| 104 |
+
if annotate_image:
|
| 105 |
+
annotated_image = draw_landmarks_on_image(image, results)
|
| 106 |
+
else:
|
| 107 |
+
annotated_image = None
|
| 108 |
+
|
| 109 |
+
all_x = []
|
| 110 |
+
all_y = []
|
| 111 |
+
all_orientation = []
|
| 112 |
+
all_keypoints = []
|
| 113 |
+
all_bounding_box = []
|
| 114 |
+
all_mouth_bounding_box = []
|
| 115 |
+
all_eye_bounding_box = []
|
| 116 |
+
all_face_contour = []
|
| 117 |
+
all_eyeball = []
|
| 118 |
+
all_eyeball_mask = []
|
| 119 |
+
all_blendshapes = []
|
| 120 |
+
all_mouth_p = []
|
| 121 |
+
all_nose_p = []
|
| 122 |
+
all_left_eye_p = []
|
| 123 |
+
all_right_eye_p = []
|
| 124 |
+
num_faces = len(results.face_landmarks)
|
| 125 |
+
|
| 126 |
+
for face_blendshapes in results.face_blendshapes:
|
| 127 |
+
blendshapes = [item.score for item in face_blendshapes]
|
| 128 |
+
all_blendshapes.append(blendshapes)
|
| 129 |
+
|
| 130 |
+
all_facial_transformation_matrices = results.facial_transformation_matrixes
|
| 131 |
+
|
| 132 |
+
for face_landmarks in results.face_landmarks:
|
| 133 |
+
keypoints = []
|
| 134 |
+
bounding_box = []
|
| 135 |
+
|
| 136 |
+
h, w = image.shape[0], image.shape[1]
|
| 137 |
+
cx_min, cy_min = w, h
|
| 138 |
+
cx_max, cy_max = 0, 0
|
| 139 |
+
for idx, lm in enumerate(face_landmarks):
|
| 140 |
+
# Clip landmarks if they go off the image
|
| 141 |
+
cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)
|
| 142 |
+
|
| 143 |
+
if cx < cx_min:
|
| 144 |
+
cx_min = cx
|
| 145 |
+
if cy < cy_min:
|
| 146 |
+
cy_min = cy
|
| 147 |
+
if cx > cx_max:
|
| 148 |
+
cx_max = cx
|
| 149 |
+
if cy > cy_max:
|
| 150 |
+
cy_max = cy
|
| 151 |
+
|
| 152 |
+
keypoints.append((lm.x, lm.y, lm.z))
|
| 153 |
+
|
| 154 |
+
if idx == 137:
|
| 155 |
+
right_cheek = (lm.x, lm.y, lm.z)
|
| 156 |
+
if idx == 366:
|
| 157 |
+
left_cheek = (lm.x, lm.y, lm.z)
|
| 158 |
+
if idx == 4:
|
| 159 |
+
nose = (lm.x, lm.y, lm.z)
|
| 160 |
+
|
| 161 |
+
# get vector from middle of face to tip of nose
|
| 162 |
+
face_middle = (
|
| 163 |
+
(right_cheek[0] + left_cheek[0]) / 2.0,
|
| 164 |
+
(right_cheek[1] + left_cheek[1]) / 2.0,
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
x = nose[0] - face_middle[0]
|
| 168 |
+
y = nose[1] - face_middle[1]
|
| 169 |
+
|
| 170 |
+
if x > 0.15:
|
| 171 |
+
orientation = "left"
|
| 172 |
+
elif x < -0.15:
|
| 173 |
+
orientation = "right"
|
| 174 |
+
else:
|
| 175 |
+
orientation = "forward"
|
| 176 |
+
|
| 177 |
+
bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]
|
| 178 |
+
|
| 179 |
+
all_keypoints.append(keypoints)
|
| 180 |
+
all_bounding_box.append(bounding_box)
|
| 181 |
+
all_x.append(x)
|
| 182 |
+
all_y.append(y)
|
| 183 |
+
all_orientation.append(orientation)
|
| 184 |
+
|
| 185 |
+
# Get mouth bounding box (landmarks 13-17 and 308-312)
|
| 186 |
+
mouth_landmarks = [
|
| 187 |
+
61,
|
| 188 |
+
146,
|
| 189 |
+
146,
|
| 190 |
+
91,
|
| 191 |
+
91,
|
| 192 |
+
181,
|
| 193 |
+
181,
|
| 194 |
+
84,
|
| 195 |
+
84,
|
| 196 |
+
17,
|
| 197 |
+
17,
|
| 198 |
+
314,
|
| 199 |
+
314,
|
| 200 |
+
405,
|
| 201 |
+
405,
|
| 202 |
+
321,
|
| 203 |
+
321,
|
| 204 |
+
375,
|
| 205 |
+
375,
|
| 206 |
+
291,
|
| 207 |
+
61,
|
| 208 |
+
185,
|
| 209 |
+
185,
|
| 210 |
+
40,
|
| 211 |
+
40,
|
| 212 |
+
39,
|
| 213 |
+
39,
|
| 214 |
+
37,
|
| 215 |
+
37,
|
| 216 |
+
0,
|
| 217 |
+
0,
|
| 218 |
+
267,
|
| 219 |
+
267,
|
| 220 |
+
269,
|
| 221 |
+
269,
|
| 222 |
+
270,
|
| 223 |
+
270,
|
| 224 |
+
409,
|
| 225 |
+
409,
|
| 226 |
+
291,
|
| 227 |
+
78,
|
| 228 |
+
95,
|
| 229 |
+
95,
|
| 230 |
+
88,
|
| 231 |
+
88,
|
| 232 |
+
178,
|
| 233 |
+
178,
|
| 234 |
+
87,
|
| 235 |
+
87,
|
| 236 |
+
14,
|
| 237 |
+
14,
|
| 238 |
+
317,
|
| 239 |
+
317,
|
| 240 |
+
402,
|
| 241 |
+
402,
|
| 242 |
+
318,
|
| 243 |
+
318,
|
| 244 |
+
324,
|
| 245 |
+
324,
|
| 246 |
+
308,
|
| 247 |
+
78,
|
| 248 |
+
191,
|
| 249 |
+
191,
|
| 250 |
+
80,
|
| 251 |
+
80,
|
| 252 |
+
81,
|
| 253 |
+
81,
|
| 254 |
+
82,
|
| 255 |
+
82,
|
| 256 |
+
13,
|
| 257 |
+
13,
|
| 258 |
+
312,
|
| 259 |
+
312,
|
| 260 |
+
311,
|
| 261 |
+
311,
|
| 262 |
+
310,
|
| 263 |
+
310,
|
| 264 |
+
415,
|
| 265 |
+
415,
|
| 266 |
+
308,
|
| 267 |
+
]
|
| 268 |
+
# mouth_landmarks = [13, 14, 15, 16, 17, 308, 309, 310, 311, 312]
|
| 269 |
+
mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
|
| 270 |
+
mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
|
| 271 |
+
mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
|
| 272 |
+
mouth_p = np.array([(mouth_bbox[0][0] + mouth_bbox[1][0]) / 2, (mouth_bbox[1][0] + mouth_bbox[1][1]) / 2])
|
| 273 |
+
mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)
|
| 274 |
+
|
| 275 |
+
nose_landmarks = [48, 115, 220, 45, 4, 275, 440, 344, 278]
|
| 276 |
+
nose_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in nose_landmarks]
|
| 277 |
+
nose_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in nose_landmarks]
|
| 278 |
+
nose_bbox = [(min(nose_x), min(nose_y)), (max(nose_x), max(nose_y))]
|
| 279 |
+
nose_p = np.array([(nose_bbox[0][0] + nose_bbox[1][0]) / 2, (nose_bbox[1][0] + nose_bbox[1][1]) / 2])
|
| 280 |
+
|
| 281 |
+
# width = mouth_bbox[1][0] - mouth_bbox[0][0]
|
| 282 |
+
# height = mouth_bbox[1][1] - mouth_bbox[0][1]
|
| 283 |
+
# max_size = max(width, height) * 1.2
|
| 284 |
+
# center_x = (mouth_bbox[0][0] + mouth_bbox[1][0]) / 2
|
| 285 |
+
# center_y = (mouth_bbox[0][1] + mouth_bbox[1][1]) / 2
|
| 286 |
+
# left_up = (int(center_x - max_size/2), int(center_y - max_size/2))
|
| 287 |
+
# right_bottom = (int(center_x + max_size/2), int(center_y + max_size/2))
|
| 288 |
+
# mouth_bbox = [left_up, right_bottom]
|
| 289 |
+
|
| 290 |
+
all_mouth_bounding_box.append(mouth_bbox)
|
| 291 |
+
|
| 292 |
+
# Get eye bounding boxes (left eye: landmarks 33-133, right eye: landmarks 362-263)
|
| 293 |
+
left_eye_landmarks = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
|
| 294 |
+
right_eye_landmarks = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]
|
| 295 |
+
|
| 296 |
+
left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
|
| 297 |
+
left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
|
| 298 |
+
left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
|
| 299 |
+
left_size = max(left_eye_y) - min(left_eye_y)
|
| 300 |
+
left_eye_p = np.array([(left_eye_bbox[0][0] + left_eye_bbox[1][0]) / 2, (left_eye_bbox[1][0] + left_eye_bbox[1][1]) / 2])
|
| 301 |
+
left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
|
| 302 |
+
|
| 303 |
+
right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
|
| 304 |
+
right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
|
| 305 |
+
right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
|
| 306 |
+
right_size = max(right_eye_y) - min(right_eye_y)
|
| 307 |
+
right_eye_p = np.array([(right_eye_bbox[0][0] + right_eye_bbox[1][0]) / 2, (right_eye_bbox[1][0] + right_eye_bbox[1][1]) / 2])
|
| 308 |
+
right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
|
| 309 |
+
|
| 310 |
+
eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}
|
| 311 |
+
|
| 312 |
+
all_eye_bounding_box.append(eye_bbox)
|
| 313 |
+
|
| 314 |
+
face_contour = np.zeros_like(image)
|
| 315 |
+
for landmark_id, landmark in enumerate(face_landmarks):
|
| 316 |
+
cx, cy = int(landmark.x * w), int(landmark.y * h)
|
| 317 |
+
if cy >= max_h or cx >= max_w: continue
|
| 318 |
+
if cy < 0 or cx < 0: continue
|
| 319 |
+
face_contour[cy, cx] = (255, 255, 255)
|
| 320 |
+
|
| 321 |
+
eyeball = np.zeros_like(image)
|
| 322 |
+
for landmark_id, landmark in enumerate(face_landmarks):
|
| 323 |
+
cx, cy = int(landmark.x * w), int(landmark.y * h)
|
| 324 |
+
if landmark_id not in [468, 473]: continue
|
| 325 |
+
if cy >= max_h or cx >= max_w: continue
|
| 326 |
+
if cy < 0 or cx < 0: continue
|
| 327 |
+
radius = int(left_size // 3) if landmark_id == 468 else int(right_size // 3)
|
| 328 |
+
cv2.circle(eyeball, (cx, cy), radius=radius, color=(255, 0, 0), thickness=-1)
|
| 329 |
+
eyeball_mask = (eyeball.sum(axis=2) != 0)[:, :, None]
|
| 330 |
+
|
| 331 |
+
all_eyeball.append(eyeball)
|
| 332 |
+
all_eyeball_mask.append(eyeball_mask)
|
| 333 |
+
all_face_contour.append(face_contour)
|
| 334 |
+
all_mouth_p.append(mouth_p)
|
| 335 |
+
all_nose_p.append(nose_p)
|
| 336 |
+
all_left_eye_p.append(left_eye_p)
|
| 337 |
+
all_right_eye_p.append(right_eye_p)
|
| 338 |
+
|
| 339 |
+
if save_vis:
|
| 340 |
+
x_min, y_min, x_max, y_max = mouth_bbox
|
| 341 |
+
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
|
| 342 |
+
|
| 343 |
+
for eye_key, bbox in eye_bbox.items():
|
| 344 |
+
x_min, y_min, x_max, y_max = bbox
|
| 345 |
+
color = (0, 0, 255)
|
| 346 |
+
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)
|
| 347 |
+
|
| 348 |
+
for landmark_id, landmark in enumerate(face_landmarks):
|
| 349 |
+
cx, cy = int(landmark.x * w), int(landmark.y * h)
|
| 350 |
+
circle_size = 2
|
| 351 |
+
if landmark_id in mouth_landmarks:
|
| 352 |
+
cv2.circle(image, (cx, cy), circle_size, (0, 0, 255), -1)
|
| 353 |
+
elif landmark_id in left_eye_landmarks+right_eye_landmarks:
|
| 354 |
+
cv2.circle(image, (cx, cy), circle_size, (0, 255, 0), -1)
|
| 355 |
+
else:
|
| 356 |
+
cv2.circle(image, (cx, cy), circle_size, (255, 255, 255), -1)
|
| 357 |
+
cv2.imwrite('image_detect.png', image[:,:,::-1])
|
| 358 |
+
# import pdb; pdb.set_trace()
|
| 359 |
+
|
| 360 |
+
return (
|
| 361 |
+
all_x,
|
| 362 |
+
all_y,
|
| 363 |
+
all_orientation,
|
| 364 |
+
num_faces,
|
| 365 |
+
all_keypoints,
|
| 366 |
+
all_bounding_box,
|
| 367 |
+
all_mouth_bounding_box,
|
| 368 |
+
all_eye_bounding_box,
|
| 369 |
+
all_face_contour,
|
| 370 |
+
all_blendshapes,
|
| 371 |
+
all_facial_transformation_matrices,
|
| 372 |
+
annotated_image,
|
| 373 |
+
all_mouth_p, # 12
|
| 374 |
+
all_nose_p, # 13
|
| 375 |
+
all_left_eye_p, # 14
|
| 376 |
+
all_right_eye_p, # 15
|
| 377 |
+
all_eyeball, # 16
|
| 378 |
+
all_eyeball_mask, # 17
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
def get_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
    """Run MediaPipe face landmarking on an RGB image and extract per-face geometry.

    For every detected face this collects: normalized (x, y, z) keypoints, a pixel
    bounding box over all landmarks, square mouth/eye bounding boxes, a sparse
    face-contour image, blendshape scores, and a coarse head-orientation estimate
    derived from the nose-to-cheek-midpoint offset.

    Args:
        image: RGB image as a numpy array of shape (H, W, 3), uint8
            (passed directly to mp.Image with SRGB format).
        mouth_bbox_scale: scale factor applied when squaring the mouth bbox
            via convert_bbox_to_square_bbox.
        eye_bbox_scale: scale factor applied when squaring each eye bbox.
        annotate_image: if True, also render landmarks onto a copy of the image
            via draw_landmarks_on_image; otherwise annotated_image is None.
        save_vis: if True, draw debug rectangles/landmark dots on `image`
            (mutates it in place) and write 'image_detect.png' to the cwd.

    Returns:
        A 12-tuple:
            (all_x, all_y, all_orientation, num_faces, all_keypoints,
             all_bounding_box, all_mouth_bounding_box, all_eye_bounding_box,
             all_face_contour, all_blendshapes,
             all_facial_transformation_matrices, annotated_image)
        where each `all_*` list has one entry per detected face.
    """
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    # get facial rotation
    results = self.detector.detect(mp_image)
    max_h, max_w = image.shape[:2]

    if annotate_image:
        annotated_image = draw_landmarks_on_image(image, results)
    else:
        annotated_image = None

    # Per-face accumulators; index i of every list refers to face i.
    all_x = []
    all_y = []
    all_orientation = []
    all_keypoints = []
    all_bounding_box = []
    all_mouth_bounding_box = []
    all_eye_bounding_box = []
    all_face_contour = []
    all_blendshapes = []
    num_faces = len(results.face_landmarks)

    # Flatten each face's blendshape categories into a plain list of scores.
    for face_blendshapes in results.face_blendshapes:
        blendshapes = [item.score for item in face_blendshapes]
        all_blendshapes.append(blendshapes)

    # NOTE: "matrixes" spelling is the MediaPipe API attribute name, not a typo here.
    all_facial_transformation_matrices = results.facial_transformation_matrixes

    for face_landmarks in results.face_landmarks:
        keypoints = []
        bounding_box = []

        h, w = image.shape[0], image.shape[1]
        # Running min/max in pixel coordinates for the whole-face bbox.
        cx_min, cy_min = w, h
        cx_max, cy_max = 0, 0
        for idx, lm in enumerate(face_landmarks):
            # Clip landmarks if they go off the image
            cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)

            if cx < cx_min:
                cx_min = cx
            if cy < cy_min:
                cy_min = cy
            if cx > cx_max:
                cx_max = cx
            if cy > cy_max:
                cy_max = cy

            # Keep the raw normalized coordinates (unclipped) as keypoints.
            keypoints.append((lm.x, lm.y, lm.z))

            # MediaPipe face-mesh indices: 137/366 are cheek points, 4 is the nose tip.
            if idx == 137:
                right_cheek = (lm.x, lm.y, lm.z)
            if idx == 366:
                left_cheek = (lm.x, lm.y, lm.z)
            if idx == 4:
                nose = (lm.x, lm.y, lm.z)

        # get vector from middle of face to tip of nose
        face_middle = (
            (right_cheek[0] + left_cheek[0]) / 2.0,
            (right_cheek[1] + left_cheek[1]) / 2.0,
        )

        x = nose[0] - face_middle[0]
        y = nose[1] - face_middle[1]

        # Threshold the horizontal nose offset (normalized units) into a
        # coarse yaw label.
        if x > 0.15:
            orientation = "left"
        elif x < -0.15:
            orientation = "right"
        else:
            orientation = "forward"

        bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]

        all_keypoints.append(keypoints)
        all_bounding_box.append(bounding_box)
        all_x.append(x)
        all_y.append(y)
        all_orientation.append(orientation)

        # Get mouth bounding box (landmarks 13-17 and 308-312)
        # NOTE(review): entries come in repeated pairs — this looks like the
        # FACEMESH_LIPS connection list flattened into endpoints; duplicates are
        # harmless for a min/max bbox. TODO confirm origin.
        mouth_landmarks = [
            61, 146, 146, 91, 91, 181, 181, 84, 84, 17,
            17, 314, 314, 405, 405, 321, 321, 375, 375, 291,
            61, 185, 185, 40, 40, 39, 39, 37, 37, 0,
            0, 267, 267, 269, 269, 270, 270, 409, 409, 291,
            78, 95, 95, 88, 88, 178, 178, 87, 87, 14,
            14, 317, 317, 402, 402, 318, 318, 324, 324, 308,
            78, 191, 191, 80, 80, 81, 81, 82, 82, 13,
            13, 312, 312, 311, 311, 310, 310, 415, 415, 308,
        ]
        # mouth_landmarks = [13, 14, 15, 16, 17, 308, 309, 310, 311, 312]
        mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
        mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
        mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
        mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)

        # width = mouth_bbox[1][0] - mouth_bbox[0][0]
        # height = mouth_bbox[1][1] - mouth_bbox[0][1]
        # max_size = max(width, height) * 1.2
        # center_x = (mouth_bbox[0][0] + mouth_bbox[1][0]) / 2
        # center_y = (mouth_bbox[0][1] + mouth_bbox[1][1]) / 2
        # left_up = (int(center_x - max_size/2), int(center_y - max_size/2))
        # right_bottom = (int(center_x + max_size/2), int(center_y + max_size/2))
        # mouth_bbox = [left_up, right_bottom]

        all_mouth_bounding_box.append(mouth_bbox)

        # Get eye bounding boxes (left eye: landmarks 33-133, right eye: landmarks 362-263)
        left_eye_landmarks = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
        right_eye_landmarks = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]

        left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
        left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
        left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
        left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

        right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
        right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
        right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
        right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

        eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}

        all_eye_bounding_box.append(eye_bbox)

        # Sparse contour image: one white pixel per landmark, same shape as `image`.
        face_contour = np.zeros_like(image)
        for landmark_id, landmark in enumerate(face_landmarks):
            cx, cy = int(landmark.x * w), int(landmark.y * h)
            # Landmarks here are NOT clipped, so skip any that fall outside the frame.
            if cy >= max_h or cx >= max_w: continue
            if cy < 0 or cx < 0: continue
            face_contour[cy, cx] = (255, 255, 255)
        all_face_contour.append(face_contour)

        if save_vis:
            # Debug-only visualization path; cv2 imported lazily so the normal
            # path has no OpenCV dependency. Mutates `image` in place.
            import cv2
            # NOTE(review): this unpacks mouth_bbox into four scalars — assumes
            # convert_bbox_to_square_bbox returns a flat (x_min, y_min, x_max, y_max);
            # the pre-conversion format was [(x1, y1), (x2, y2)]. TODO confirm.
            x_min, y_min, x_max, y_max = mouth_bbox
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

            for eye_key, bbox in eye_bbox.items():
                x_min, y_min, x_max, y_max = bbox
                color = (0, 0, 255)
                cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)

            for landmark_id, landmark in enumerate(face_landmarks):
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                circle_size = 2
                # Color code: mouth red, eyes green, everything else white.
                if landmark_id in mouth_landmarks:
                    cv2.circle(image, (cx, cy), circle_size, (0, 0, 255), -1)
                elif landmark_id in left_eye_landmarks+right_eye_landmarks:
                    cv2.circle(image, (cx, cy), circle_size, (0, 255, 0), -1)
                else:
                    cv2.circle(image, (cx, cy), circle_size, (255, 255, 255), -1)
            # Channel reversal RGB->BGR for cv2.imwrite.
            cv2.imwrite('image_detect.png', image[:,:,::-1])
            # import pdb; pdb.set_trace()

    return (
        all_x,
        all_y,
        all_orientation,
        num_faces,
        all_keypoints,
        all_bounding_box,
        all_mouth_bounding_box,
        all_eye_bounding_box,
        all_face_contour,
        all_blendshapes,
        all_facial_transformation_matrices,
        annotated_image,
    )
|
tools/visualization_0416/utils/face_landmarker.task
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
|
| 3 |
+
size 3758596
|