import argparse
import math
import os
from enum import IntEnum
from pathlib import Path

import cv2
import face_alignment
import numpy as np
import numpy.linalg as npla
from PIL import Image

from diffusers.utils import load_image


# Reference positions of 33 facial landmarks in a normalized [0..1] aligned
# space. Each row is (x, y); the trailing comment is the index of the matching
# point in the 68-point landmark layout (points 17..48 plus point 54).
landmarks_2D_new = np.array(
    [
        [0.000213256, 0.106454],  # 17
        [0.0752622, 0.038915],  # 18
        [0.18113, 0.0187482],  # 19
        [0.29077, 0.0344891],  # 20
        [0.393397, 0.0773906],  # 21
        [0.586856, 0.0773906],  # 22
        [0.689483, 0.0344891],  # 23
        [0.799124, 0.0187482],  # 24
        [0.904991, 0.038915],  # 25
        [0.98004, 0.106454],  # 26
        [0.490127, 0.203352],  # 27
        [0.490127, 0.307009],  # 28
        [0.490127, 0.409805],  # 29
        [0.490127, 0.515625],  # 30
        [0.36688, 0.587326],  # 31
        [0.426036, 0.609345],  # 32
        [0.490127, 0.628106],  # 33
        [0.554217, 0.609345],  # 34
        [0.613373, 0.587326],  # 35
        [0.121737, 0.216423],  # 36
        [0.187122, 0.178758],  # 37
        [0.265825, 0.179852],  # 38
        [0.334606, 0.231733],  # 39
        [0.260918, 0.245099],  # 40
        [0.182743, 0.244077],  # 41
        [0.645647, 0.231733],  # 42
        [0.714428, 0.179852],  # 43
        [0.793132, 0.178758],  # 44
        [0.858516, 0.216423],  # 45
        [0.79751, 0.244077],  # 46
        [0.719335, 0.245099],  # 47
        [0.254149, 0.780233],  # 48
        [0.726104, 0.780233],  # 54
    ],
    dtype=np.float32,
)


class FaceType(IntEnum):
    """Kinds of face crop that can be extracted from an image.

    Conversion to and from the command-line names is done through the
    module-level ``to_string_dict`` / ``from_string_dict`` tables.
    """

    # enumerating in order "next contains prev"
    HALF = 0
    MID_FULL = 1
    FULL = 2
    FULL_NO_ALIGN = 3
    WHOLE_FACE = 4
    WHOLE_FACE_NO_ALIGN = 5
    HEAD = 10
    HEAD_NO_ALIGN = 20
    # Plain int rather than a one-element tuple: IntEnum unpacked the tuple
    # into ``int(100)`` anyway, so the member value is unchanged.
    MARK_ONLY = 100  # no align at all, just embedded faceinfo

    @staticmethod
    def fromString(s):
        """Return the ``FaceType`` whose name is ``s`` (case-insensitive).

        Raises
        ------
        ValueError
            If ``s`` is not one of the names in ``from_string_dict``.
        """
        r = from_string_dict.get(s.lower())
        if r is None:
            raise ValueError("FaceType.fromString value error")
        return r

    @staticmethod
    def toString(face_type):
        """Return the string name of ``face_type``; ``KeyError`` if unknown."""
        return to_string_dict[face_type]


# FaceType member -> name used on the command line.
to_string_dict = {
    FaceType.HALF: "half_face",
    FaceType.MID_FULL: "midfull_face",
    FaceType.FULL: "full_face",
    FaceType.FULL_NO_ALIGN: "full_face_no_align",
    FaceType.WHOLE_FACE: "whole_face",
    FaceType.WHOLE_FACE_NO_ALIGN: "whole_face_no_align",
    FaceType.HEAD: "head",
    FaceType.HEAD_NO_ALIGN: "head_no_align",
    FaceType.MARK_ONLY: "mark_only",
}
# Name used on the command line -> FaceType member (inverse of to_string_dict).
from_string_dict = {name: face_type for face_type, name in to_string_dict.items()}

# FaceType -> (padding, remove_align). ``padding`` enlarges the crop around the
# aligned face square; ``remove_align`` requests a centered but unrotated crop.
FaceType_to_padding_remove_align = {
    FaceType.HALF: (0.0, False),
    FaceType.MID_FULL: (0.0675, False),
    FaceType.FULL: (0.2109375, False),
    FaceType.FULL_NO_ALIGN: (0.2109375, True),
    FaceType.WHOLE_FACE: (0.40, False),
    FaceType.WHOLE_FACE_NO_ALIGN: (0.40, True),
    FaceType.HEAD: (0.70, False),
    FaceType.HEAD_NO_ALIGN: (0.70, True),
}


def umeyama(src, dst, estimate_scale):
    """Estimate N-D similarity transformation with or without scaling.

    Parameters
    ----------
    src : (M, N) array
        Source coordinates.
    dst : (M, N) array
        Destination coordinates.
    estimate_scale : bool
        Whether to estimate scaling factor.

    Returns
    -------
    T : (N + 1, N + 1)
        The homogeneous similarity transformation matrix. The matrix contains
        NaN values only if the problem is not well-conditioned.

    References
    ----------
    .. [1] "Least-squares estimation of transformation parameters between two
            point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573
    """
    num = src.shape[0]
    dim = src.shape[1]

    # Compute mean of src and dst.
    src_mean = src.mean(axis=0)
    dst_mean = dst.mean(axis=0)

    # Subtract mean from src and dst.
    src_demean = src - src_mean
    dst_demean = dst - dst_mean

    # Eq. (38).
    A = np.dot(dst_demean.T, src_demean) / num

    # Eq. (39).
    d = np.ones((dim,), dtype=np.double)
    if np.linalg.det(A) < 0:
        d[dim - 1] = -1

    T = np.eye(dim + 1, dtype=np.double)

    U, S, V = np.linalg.svd(A)

    # Eq. (40) and (43).
    rank = np.linalg.matrix_rank(A)
    if rank == 0:
        return np.nan * T
    elif rank == dim - 1:
        if np.linalg.det(U) * np.linalg.det(V) > 0:
            T[:dim, :dim] = np.dot(U, V)
        else:
            # Temporarily flip the last sign, then restore it because ``d`` is
            # reused for the scale estimate below.
            s = d[dim - 1]
            d[dim - 1] = -1
            T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V))
            d[dim - 1] = s
    else:
        T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V))

    if estimate_scale:
        # Eq. (41) and (42).
        scale = 1.0 / src_demean.var(axis=0).sum() * np.dot(S, d)
    else:
        scale = 1.0

    T[:dim, dim] = dst_mean - scale * np.dot(T[:dim, :dim], src_mean.T)
    T[:dim, :dim] *= scale

    return T


def transform_points(points, mat, invert=False):
    """Apply the 2x3 affine matrix ``mat`` (or its inverse) to ``points``.

    Parameters
    ----------
    points : array of 2-D points, one per row.
    mat : 2x3 affine transformation matrix.
    invert : bool
        If true, the inverse of ``mat`` is applied instead.

    Returns
    -------
    The transformed points with singleton dimensions squeezed out.
    """
    if invert:
        mat = cv2.invertAffineTransform(mat)
    # cv2.transform expects one multi-channel element per point: (N, 1, 2).
    points = np.expand_dims(points, axis=1)
    # The third positional argument of cv2.transform is ``dst``; the shape
    # tuple previously passed there had no effect, so it is omitted.
    points = cv2.transform(points, mat)
    points = np.squeeze(points)
    return points


def estimate_averaged_yaw(landmarks):
    """Estimate yaw from horizontal asymmetry of the landmarks.

    Averages the x-distance from three points on each side of the landmark
    set (indices 0-2 and 14-16) to points 27-29, and returns right minus left
    as a float.
    """
    # Works much better than solvePnP if landmarks from "3DFAN"
    if not isinstance(landmarks, np.ndarray):
        landmarks = np.array(landmarks)
    l = (
        (landmarks[27][0] - landmarks[0][0])
        + (landmarks[28][0] - landmarks[1][0])
        + (landmarks[29][0] - landmarks[2][0])
    ) / 3.0
    r = (
        (landmarks[16][0] - landmarks[27][0])
        + (landmarks[15][0] - landmarks[28][0])
        + (landmarks[14][0] - landmarks[29][0])
    ) / 3.0
    return float(r - l)


def polygon_area(x, y):
    """Return the area of the polygon with vertex coordinates ``x``, ``y``.

    Uses the shoelace formula; vertices must be given in traversal order.
    """
    return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))


def get_transform_mat(image_landmarks, output_size, face_type, scale=1.0):
    """Compute the affine matrix mapping the image onto an aligned face crop.

    Parameters
    ----------
    image_landmarks : array-like of landmark points in image coordinates.
        Points 17..48 and 54 are matched against ``landmarks_2D_new``.
    output_size : int
        Side length of the square output crop, in pixels.
    face_type : FaceType
        Selects padding and alignment behavior through
        ``FaceType_to_padding_remove_align``. Face types missing from that
        table are treated as ``(0.0, False)``: no padding, aligned.
    scale : float
        Zoom factor; the crop radius is multiplied by ``1 / scale``.

    Returns
    -------
    The 2x3 matrix produced by ``cv2.getAffineTransform``.
    """
    if not isinstance(image_landmarks, np.ndarray):
        image_landmarks = np.array(image_landmarks)

    # estimate landmarks transform from global space to local aligned space with bounds [0..1]
    mat = umeyama(
        np.concatenate([image_landmarks[17:49], image_landmarks[54:55]]),
        landmarks_2D_new,
        True,
    )[0:2]

    # get corner points in global space
    g_p = transform_points(
        np.float32([(0, 0), (1, 0), (1, 1), (0, 1), (0.5, 0.5)]), mat, True
    )
    g_c = g_p[4]

    # calc diagonal vectors between corners in global space
    tb_diag_vec = (g_p[2] - g_p[0]).astype(np.float32)
    tb_diag_vec /= npla.norm(tb_diag_vec)
    bt_diag_vec = (g_p[1] - g_p[3]).astype(np.float32)
    bt_diag_vec /= npla.norm(bt_diag_vec)

    # calc modifier of diagonal vectors for scale and padding value
    # The default must be a 2-tuple: a bare 0.0 cannot be unpacked and raised
    # TypeError for face types absent from the table (e.g. MARK_ONLY).
    padding, remove_align = FaceType_to_padding_remove_align.get(
        face_type, (0.0, False)
    )
    mod = (1.0 / scale) * (
        npla.norm(g_p[0] - g_p[2]) * (padding * np.sqrt(2.0) + 0.5)
    )

    if face_type == FaceType.WHOLE_FACE:
        # adjust vertical offset for WHOLE_FACE, 7% below in order to cover more forehead
        vec = (g_p[0] - g_p[3]).astype(np.float32)
        vec_len = npla.norm(vec)
        vec /= vec_len
        g_c += vec * vec_len * 0.07

    elif face_type == FaceType.HEAD:
        # assuming image_landmarks are 3D_Landmarks extracted for HEAD,
        # adjust horizontal offset according to estimated yaw
        yaw = estimate_averaged_yaw(transform_points(image_landmarks, mat, False))

        hvec = (g_p[0] - g_p[1]).astype(np.float32)
        hvec_len = npla.norm(hvec)
        hvec /= hvec_len

        yaw *= np.abs(math.tanh(yaw * 2))  # Damp near zero

        g_c -= hvec * (yaw * hvec_len / 2.0)

        # adjust vertical offset for HEAD, 50% below
        vvec = (g_p[0] - g_p[3]).astype(np.float32)
        vvec_len = npla.norm(vvec)
        vvec /= vvec_len
        g_c += vvec * vvec_len * 0.50

    # calc 3 points in global space to estimate 2d affine transform
    if not remove_align:
        l_t = np.array(
            [g_c - tb_diag_vec * mod, g_c + bt_diag_vec * mod, g_c + tb_diag_vec * mod]
        )
    else:
        # remove_align - face will be centered in the frame but not aligned
        l_t = np.array(
            [
                g_c - tb_diag_vec * mod,
                g_c + bt_diag_vec * mod,
                g_c + tb_diag_vec * mod,
                g_c - bt_diag_vec * mod,
            ]
        )

        # get area of face square in global space
        area = polygon_area(l_t[:, 0], l_t[:, 1])

        # calc side of square
        side = np.float32(math.sqrt(area) / 2)

        # calc 3 points with unrotated square
        l_t = np.array([g_c + [-side, -side], g_c + [side, -side], g_c + [side, side]])

    # calc affine transform from 3 global space points to 3 local space points size of 'output_size'
    pts2 = np.float32(((0, 0), (output_size, 0), (output_size, output_size)))
    l_t = l_t.astype(np.float32)
    mat = cv2.getAffineTransform(l_t, pts2)
    return mat


def extract_faces(model, image, face_image_size, face_type=FaceType.WHOLE_FACE):
    """Detect faces in ``image`` and return one aligned square crop per face.

    Parameters
    ----------
    model : object exposing ``get_landmarks(array)``.
        NOTE(review): expected to be a ``face_alignment.FaceAlignment``
        instance, as built in the script entry point below.
    image : image convertible with ``np.array`` to a (H, W, C>=3) array.
    face_image_size : int
        Side length of each output crop, in pixels.
    face_type : FaceType
        Crop type forwarded to ``get_transform_mat``.

    Returns
    -------
    (face_images, image_to_face_matrices) : two parallel lists holding the
        PIL crops and the 2x3 image-to-crop affine matrices. Both are empty
        when no face is found.
    """
    # take the first three channels (R, G, B)
    array = np.array(image)[:, :, :3]

    preds = model.get_landmarks(array)
    if preds is None:
        # face_alignment reports "no face detected" as None, which would make
        # the loop below raise TypeError.
        return [], []

    face_images = []
    image_to_face_matrices = []
    for face_landmarks in preds:
        image_to_face_mat = get_transform_mat(
            face_landmarks, face_image_size, face_type
        )
        # ``flags`` must be passed by keyword: the 4th positional argument of
        # cv2.warpAffine is ``dst``, so a positional interpolation flag is
        # silently ignored and the default interpolation is used instead.
        face_array = cv2.warpAffine(
            array,
            image_to_face_mat,
            (face_image_size, face_image_size),
            flags=cv2.INTER_LANCZOS4,
            borderValue=(255, 255, 255),
        )
        image_to_face_matrices.append(image_to_face_mat)
        face_images.append(Image.fromarray(face_array))

    return face_images, image_to_face_matrices


def parse_args():
    """Parse command-line arguments.

    Returns the argparse namespace with ``face_type`` already converted from
    its string name to a ``FaceType`` member.

    Raises
    ------
    ValueError
        If ``--face_type`` is not a known face type name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--image_path",
        type=str,
        default="",
        help="Path to input image",
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=256,
        help="Output image size",
    )
    parser.add_argument(
        "--output_folder",
        type=str,
        default="./face_images",
        help="Path to output folder",
    )
    parser.add_argument(
        "--face_type",
        type=str,
        default="whole_face",
        help=(
            "Face type to extract (e.g., half_face, midfull_face, full_face, "
            "full_face_no_align, whole_face, whole_face_no_align, head, "
            "head_no_align, mark_only.)"
        ),
    )
    args = parser.parse_args()

    # Convert face_type string to FaceType enum
    try:
        args.face_type = FaceType.fromString(args.face_type)
    except Exception as err:
        # Chain the original error so the traceback keeps the root cause.
        raise ValueError(f"Invalid face_type: {args.face_type}") from err

    return args


if __name__ == "__main__":
    args = parse_args()

    # sfd for SFD, dlib for Dlib and folder for existing bounding boxes.
    fa = face_alignment.FaceAlignment(
        face_alignment.LandmarksType.TWO_D, face_detector="sfd"
    )

    pil_image = load_image(args.image_path)
    face_images, image_to_face_matrices = extract_faces(
        fa, pil_image, args.image_size, args.face_type
    )

    # Make sure the output folder exists
    os.makedirs(args.output_folder, exist_ok=True)

    # Save each crop as <input stem>_<two-digit index>.png
    input_filename = Path(args.image_path).stem
    for i, face_image in enumerate(face_images):
        face_image.save(Path(args.output_folder, f"{input_filename}_{i:02}.png"))