import mediapipe as mp
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
import cv2

# Lip-contour landmark indices (outer + inner lips), flattened pairs of
# connection endpoints as used by MediaPipe's FACEMESH_LIPS set.
_MOUTH_LANDMARKS = [
    61, 146, 146, 91, 91, 181, 181, 84, 84, 17, 17, 314, 314, 405, 405, 321,
    321, 375, 375, 291, 61, 185, 185, 40, 40, 39, 39, 37, 37, 0, 0, 267, 267,
    269, 269, 270, 270, 409, 409, 291, 78, 95, 95, 88, 88, 178, 178, 87, 87,
    14, 14, 317, 317, 402, 402, 318, 318, 324, 324, 308, 78, 191, 191, 80, 80,
    81, 81, 82, 82, 13, 13, 312, 312, 311, 311, 310, 310, 415, 415, 308,
]
_NOSE_LANDMARKS = [48, 115, 220, 45, 4, 275, 440, 344, 278]
_LEFT_EYE_LANDMARKS = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
_RIGHT_EYE_LANDMARKS = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]


def convert_bbox_to_square_bbox(bbox, max_h, max_w, scale=1.0):
    """Expand ``bbox`` into a (scaled) square centred on the original box.

    Args:
        bbox: ``[(x_min, y_min), (x_max, y_max)]`` in pixel coordinates.
        max_h: image height; the result is clipped to ``[0, max_h]``.
        max_w: image width; the result is clipped to ``[0, max_w]``.
        scale: multiplier applied to the square's side length.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as ints. NOTE: clipping at the image
        border can make the returned box non-square.
    """
    # Side of the square: the larger bbox dimension, scaled.
    width = bbox[1][0] - bbox[0][0]
    height = bbox[1][1] - bbox[0][1]
    side = max(width, height) * scale

    # Centre of the original bounding box.
    center_x = (bbox[0][0] + bbox[1][0]) / 2
    center_y = (bbox[0][1] + bbox[1][1]) / 2

    # Corners of the square box around that centre.
    half = side / 2
    left_top = [int(center_x - half), int(center_y - half)]
    right_bottom = [int(center_x + half), int(center_y + half)]

    # Clamp to image bounds.
    left_top[0] = max(0, left_top[0])
    left_top[1] = max(0, left_top[1])
    right_bottom[0] = min(max_w, right_bottom[0])
    right_bottom[1] = min(max_h, right_bottom[1])

    return [left_top[0], left_top[1], right_bottom[0], right_bottom[1]]


def draw_landmarks_on_image(rgb_image, detection_result):
    """Return a copy of ``rgb_image`` with the detected face meshes drawn on it.

    Draws the tesselation, contours and iris connections for every face in
    ``detection_result.face_landmarks`` using MediaPipe's default styles.
    """
    annotated_image = np.copy(rgb_image)

    for face_landmarks in detection_result.face_landmarks:
        # Convert the task-API landmarks into the proto list the legacy
        # drawing utilities expect.
        face_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        face_landmarks_proto.landmark.extend(
            [
                landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z)
                for landmark in face_landmarks
            ]
        )
        solutions.drawing_utils.draw_landmarks(
            image=annotated_image,
            landmark_list=face_landmarks_proto,
            connections=mp.solutions.face_mesh.FACEMESH_TESSELATION,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_tesselation_style(),
        )
        solutions.drawing_utils.draw_landmarks(
            image=annotated_image,
            landmark_list=face_landmarks_proto,
            connections=mp.solutions.face_mesh.FACEMESH_CONTOURS,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_contours_style(),
        )
        solutions.drawing_utils.draw_landmarks(
            image=annotated_image,
            landmark_list=face_landmarks_proto,
            connections=mp.solutions.face_mesh.FACEMESH_IRISES,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp.solutions.drawing_styles.get_default_face_mesh_iris_connections_style(),
        )
    return annotated_image


class FaceDetector:
    """Thin wrapper around MediaPipe's FaceLandmarker task (image mode)."""

    def __init__(self, mediapipe_model_asset_path, delegate=1, face_detection_confidence=0.5, num_faces=5):
        # Create a face landmarker instance in single-image mode; the same
        # confidence value is reused for detection, presence and tracking.
        options = mp.tasks.vision.FaceLandmarkerOptions(
            base_options=mp.tasks.BaseOptions(
                model_asset_path=mediapipe_model_asset_path,
                # delegate=mp.tasks.BaseOptions.Delegate.GPU,
                # TODO: why does the gpu version not work in docker???
                delegate=delegate,
            ),
            running_mode=mp.tasks.vision.RunningMode.IMAGE,
            num_faces=num_faces,
            output_face_blendshapes=True,
            output_facial_transformation_matrixes=True,
            min_face_detection_confidence=face_detection_confidence,
            min_face_presence_confidence=face_detection_confidence,
            min_tracking_confidence=face_detection_confidence,
        )
        self.detector = mp.tasks.vision.FaceLandmarker.create_from_options(options)

    @staticmethod
    def _landmark_pixels(face_landmarks, indices, w, h):
        """Return (xs, ys) pixel coordinate lists for the given landmark
        indices, with normalized coordinates clipped to [0, 1] first."""
        xs = [int(np.clip(face_landmarks[i].x, 0, 1) * w) for i in indices]
        ys = [int(np.clip(face_landmarks[i].y, 0, 1) * h) for i in indices]
        return xs, ys

    def get_one_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale=1.2, eye_bbox_scale=1.5, annotate_image: bool = False, save_vis=False):
        """Detect faces and derive per-face geometry from an RGB image.

        Args:
            image: RGB uint8 array of shape (H, W, 3).
            mouth_bbox_scale: scale factor for the square mouth crop.
            eye_bbox_scale: scale factor for the square eye crops.
            annotate_image: if True, also build a landmark-overlay image.
            save_vis: if True, write a debug visualisation to
                'image_detect.png'. NOTE: this mutates ``image`` in place.

        Returns:
            An 18-tuple of per-face lists (see the return statement for the
            exact order): nose-offset x/y, coarse orientation, face count,
            keypoints, face/mouth/eye bounding boxes, contour images,
            blendshape scores, transformation matrices, the annotated image
            (or None), centre points for mouth/nose/eyes, and eyeball
            overlay images plus their boolean masks.
        """
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
        # Run the landmarker once on the still image.
        results = self.detector.detect(mp_image)
        max_h, max_w = image.shape[:2]

        annotated_image = draw_landmarks_on_image(image, results) if annotate_image else None

        all_x = []
        all_y = []
        all_orientation = []
        all_keypoints = []
        all_bounding_box = []
        all_mouth_bounding_box = []
        all_eye_bounding_box = []
        all_face_contour = []
        all_eyeball = []
        all_eyeball_mask = []
        all_blendshapes = []
        all_mouth_p = []
        all_nose_p = []
        all_left_eye_p = []
        all_right_eye_p = []

        num_faces = len(results.face_landmarks)
        for face_blendshapes in results.face_blendshapes:
            all_blendshapes.append([item.score for item in face_blendshapes])
        all_facial_transformation_matrices = results.facial_transformation_matrixes

        for face_landmarks in results.face_landmarks:
            keypoints = []
            h, w = image.shape[0], image.shape[1]
            cx_min, cy_min = w, h
            cx_max, cy_max = 0, 0
            for idx, lm in enumerate(face_landmarks):
                # Clip landmarks if they go off the image.
                cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)
                cx_min = min(cx_min, cx)
                cy_min = min(cy_min, cy)
                cx_max = max(cx_max, cx)
                cy_max = max(cy_max, cy)
                keypoints.append((lm.x, lm.y, lm.z))
                if idx == 137:
                    right_cheek = (lm.x, lm.y, lm.z)
                if idx == 366:
                    left_cheek = (lm.x, lm.y, lm.z)
                if idx == 4:
                    nose = (lm.x, lm.y, lm.z)

            # Vector from the middle of the face (between the cheeks) to the
            # tip of the nose gives a coarse yaw estimate in normalized units.
            face_middle = (
                (right_cheek[0] + left_cheek[0]) / 2.0,
                (right_cheek[1] + left_cheek[1]) / 2.0,
            )
            x = nose[0] - face_middle[0]
            y = nose[1] - face_middle[1]
            if x > 0.15:
                orientation = "left"
            elif x < -0.15:
                orientation = "right"
            else:
                orientation = "forward"

            bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]
            all_keypoints.append(keypoints)
            all_bounding_box.append(bounding_box)
            all_x.append(x)
            all_y.append(y)
            all_orientation.append(orientation)

            # Mouth bounding box from the lip-contour landmarks.
            mouth_x, mouth_y = self._landmark_pixels(face_landmarks, _MOUTH_LANDMARKS, w, h)
            mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
            # BUGFIX: the y-centre was previously (x_max + y_max) / 2; it must
            # be (y_min + y_max) / 2 to mirror the x-centre computation.
            mouth_p = np.array([
                (mouth_bbox[0][0] + mouth_bbox[1][0]) / 2,
                (mouth_bbox[0][1] + mouth_bbox[1][1]) / 2,
            ])
            mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)
            all_mouth_bounding_box.append(mouth_bbox)

            # Nose bounding box and centre point (same BUGFIX as above).
            nose_x, nose_y = self._landmark_pixels(face_landmarks, _NOSE_LANDMARKS, w, h)
            nose_bbox = [(min(nose_x), min(nose_y)), (max(nose_x), max(nose_y))]
            nose_p = np.array([
                (nose_bbox[0][0] + nose_bbox[1][0]) / 2,
                (nose_bbox[0][1] + nose_bbox[1][1]) / 2,
            ])

            # Eye bounding boxes; the vertical extent is kept as a size proxy
            # for the eyeball radius below.
            left_eye_x, left_eye_y = self._landmark_pixels(face_landmarks, _LEFT_EYE_LANDMARKS, w, h)
            left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
            left_size = max(left_eye_y) - min(left_eye_y)
            left_eye_p = np.array([
                (left_eye_bbox[0][0] + left_eye_bbox[1][0]) / 2,
                (left_eye_bbox[0][1] + left_eye_bbox[1][1]) / 2,
            ])
            left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

            right_eye_x, right_eye_y = self._landmark_pixels(face_landmarks, _RIGHT_EYE_LANDMARKS, w, h)
            right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
            right_size = max(right_eye_y) - min(right_eye_y)
            right_eye_p = np.array([
                (right_eye_bbox[0][0] + right_eye_bbox[1][0]) / 2,
                (right_eye_bbox[0][1] + right_eye_bbox[1][1]) / 2,
            ])
            right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)

            eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}
            all_eye_bounding_box.append(eye_bbox)

            # One white pixel per landmark, on a black canvas.
            face_contour = np.zeros_like(image)
            for landmark in face_landmarks:
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if 0 <= cy < max_h and 0 <= cx < max_w:
                    face_contour[cy, cx] = (255, 255, 255)

            # Filled discs at landmarks 468/473 (presumably the iris centres —
            # confirm against the face-mesh index map), sized from eye height.
            eyeball = np.zeros_like(image)
            for landmark_id, landmark in enumerate(face_landmarks):
                if landmark_id not in (468, 473):
                    continue
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if not (0 <= cy < max_h and 0 <= cx < max_w):
                    continue
                radius = int(left_size // 3) if landmark_id == 468 else int(right_size // 3)
                cv2.circle(eyeball, (cx, cy), radius=radius, color=(255, 0, 0), thickness=-1)
            eyeball_mask = (eyeball.sum(axis=2) != 0)[:, :, None]

            all_eyeball.append(eyeball)
            all_eyeball_mask.append(eyeball_mask)
            all_face_contour.append(face_contour)
            all_mouth_p.append(mouth_p)
            all_nose_p.append(nose_p)
            all_left_eye_p.append(left_eye_p)
            all_right_eye_p.append(right_eye_p)

            if save_vis:
                # Debug rendering drawn directly onto the input image.
                x_min, y_min, x_max, y_max = mouth_bbox
                cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
                for bbox in eye_bbox.values():
                    x_min, y_min, x_max, y_max = bbox
                    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
                for landmark_id, landmark in enumerate(face_landmarks):
                    cx, cy = int(landmark.x * w), int(landmark.y * h)
                    if landmark_id in _MOUTH_LANDMARKS:
                        color = (0, 0, 255)
                    elif landmark_id in _LEFT_EYE_LANDMARKS + _RIGHT_EYE_LANDMARKS:
                        color = (0, 255, 0)
                    else:
                        color = (255, 255, 255)
                    cv2.circle(image, (cx, cy), 2, color, -1)
                # Channel reversal: the image is RGB, cv2.imwrite expects BGR.
                cv2.imwrite('image_detect.png', image[:, :, ::-1])

        return (
            all_x,
            all_y,
            all_orientation,
            num_faces,
            all_keypoints,
            all_bounding_box,
            all_mouth_bounding_box,
            all_eye_bounding_box,
            all_face_contour,
            all_blendshapes,
            all_facial_transformation_matrices,
            annotated_image,
            all_mouth_p,  # 12
            all_nose_p,  # 13
            all_left_eye_p,  # 14
            all_right_eye_p,  # 15
            all_eyeball,  # 16
            all_eyeball_mask,  # 17
        )

    def get_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale=1.2, eye_bbox_scale=1.5, annotate_image: bool = False, save_vis=False):
        """Backward-compatible variant returning only the first 12 outputs.

        Previously a near-verbatim copy of
        :meth:`get_one_face_xy_rotation_and_keypoints`; now delegates to it
        and drops the centre-point and eyeball outputs, returning
        ``(all_x, all_y, all_orientation, num_faces, all_keypoints,
        all_bounding_box, all_mouth_bounding_box, all_eye_bounding_box,
        all_face_contour, all_blendshapes,
        all_facial_transformation_matrices, annotated_image)``.
        """
        return self.get_one_face_xy_rotation_and_keypoints(
            image,
            mouth_bbox_scale=mouth_bbox_scale,
            eye_bbox_scale=eye_bbox_scale,
            annotate_image=annotate_image,
            save_vis=save_vis,
        )[:12]