# Source: robinwitch -- "upload ckpt" (commit 872b1a7)
import mediapipe as mp
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
import cv2
def convert_bbox_to_square_bbox(bbox, max_h, max_w, scale=1.0):
    """Expand a ``[(x0, y0), (x1, y1)]`` bbox into a scaled square, clamped to the image.

    The square is centred on the original box and its side is
    ``max(width, height) * scale``.  Each corner is then clipped to the
    ``max_w`` x ``max_h`` image bounds, so a box near the border may end up
    slightly non-square.

    Returns:
        A flat ``[x0, y0, x1, y1]`` list of ints (note: flat, unlike the
        nested pair-of-points input format).
    """
    (x0, y0), (x1, y1) = bbox
    side = max(x1 - x0, y1 - y0) * scale
    half = side / 2
    cx = (x0 + x1) / 2
    cy = (y0 + y1) / 2
    # Truncate to int first (matching int() semantics), then clamp to bounds.
    left = max(0, int(cx - half))
    top = max(0, int(cy - half))
    right = min(max_w, int(cx + half))
    bottom = min(max_h, int(cy + half))
    return [left, top, right, bottom]
def draw_landmarks_on_image(rgb_image, detection_result):
    """Render MediaPipe face-mesh landmarks onto a copy of the input image.

    For every face in ``detection_result`` this draws the tesselation,
    contour, and iris connection sets using the default mediapipe styles.
    The input image is not modified; the annotated copy is returned.
    """
    annotated = np.copy(rgb_image)
    mesh = mp.solutions.face_mesh
    styles = mp.solutions.drawing_styles
    for face_landmarks in detection_result.face_landmarks:
        # Re-pack the task-API landmarks into the proto list expected by the
        # legacy drawing utilities.
        proto = landmark_pb2.NormalizedLandmarkList()
        proto.landmark.extend(
            landmark_pb2.NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z)
            for lm in face_landmarks
        )
        # Draw each connection set with its matching default style.
        for connections, style in (
            (mesh.FACEMESH_TESSELATION, styles.get_default_face_mesh_tesselation_style()),
            (mesh.FACEMESH_CONTOURS, styles.get_default_face_mesh_contours_style()),
            (mesh.FACEMESH_IRISES, styles.get_default_face_mesh_iris_connections_style()),
        ):
            solutions.drawing_utils.draw_landmarks(
                image=annotated,
                landmark_list=proto,
                connections=connections,
                landmark_drawing_spec=None,
                connection_drawing_spec=style,
            )
    return annotated
class FaceDetector:
    """Wrapper around MediaPipe's FaceLandmarker in single-image mode.

    Given one RGB frame, it reports per detected face: a coarse yaw estimate
    ("left"/"right"/"forward"), the normalized 3D landmarks, face / mouth /
    eye bounding boxes, blendshape scores, facial transformation matrices,
    and (in the extended variant) region center points and iris masks.
    """

    # Lip contour indices from the MediaPipe FaceMesh topology, listed as
    # connection pairs (outer ring, then the two inner rings).  Duplicates
    # are harmless: the list is only used for min/max bbox computation and
    # membership tests when visualizing.
    _MOUTH_LANDMARKS = [
        61, 146, 146, 91, 91, 181, 181, 84, 84, 17, 17, 314, 314, 405,
        405, 321, 321, 375, 375, 291, 61, 185, 185, 40, 40, 39, 39, 37,
        37, 0, 0, 267, 267, 269, 269, 270, 270, 409, 409, 291, 78, 95,
        95, 88, 88, 178, 178, 87, 87, 14, 14, 317, 317, 402, 402, 318,
        318, 324, 324, 308, 78, 191, 191, 80, 80, 81, 81, 82, 82, 13,
        13, 312, 312, 311, 311, 310, 310, 415, 415, 308,
    ]
    # Nose region indices (tip, bridge and alae).
    _NOSE_LANDMARKS = [48, 115, 220, 45, 4, 275, 440, 344, 278]
    # Eye-ring indices.  "left"/"right" keep the original code's naming --
    # presumably the subject's left/right eye; confirm against callers.
    _LEFT_EYE_LANDMARKS = [362, 398, 384, 385, 386, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381, 382]
    _RIGHT_EYE_LANDMARKS = [33, 246, 161, 160, 159, 158, 157, 173, 133, 155, 154, 153, 145, 144, 163, 7]

    def __init__(self, mediapipe_model_asset_path, delegate=1, face_detection_confidence=0.5, num_faces=5):
        """Create a FaceLandmarker instance in IMAGE running mode.

        Args:
            mediapipe_model_asset_path: path to the face_landmarker ``.task`` model file.
            delegate: value for ``mp.tasks.BaseOptions.delegate`` (default 1; see
                the mediapipe Delegate enum -- the GPU delegate was observed not
                to work inside docker, see TODO below).
            face_detection_confidence: threshold reused for detection, presence,
                and tracking confidence alike.
            num_faces: maximum number of faces to detect per image.
        """
        options = mp.tasks.vision.FaceLandmarkerOptions(
            base_options=mp.tasks.BaseOptions(
                model_asset_path=mediapipe_model_asset_path,
                # delegate=mp.tasks.BaseOptions.Delegate.GPU,
                # TODO: why does the gpu version not work in docker???
                delegate=delegate,
            ),
            running_mode=mp.tasks.vision.RunningMode.IMAGE,
            num_faces=num_faces,
            output_face_blendshapes=True,
            output_facial_transformation_matrixes=True,
            min_face_detection_confidence=face_detection_confidence,
            min_face_presence_confidence=face_detection_confidence,
            min_tracking_confidence=face_detection_confidence,
        )
        self.detector = mp.tasks.vision.FaceLandmarker.create_from_options(options)

    def _save_debug_visualization(self, image, mouth_bbox, eye_bbox, face_landmarks, w, h):
        """Draw mouth/eye boxes and landmark dots onto ``image`` in place and
        write the result to ``image_detect.png``.

        The channels are reversed on write -- presumably the input is RGB and
        cv2.imwrite expects BGR; confirm against callers.
        """
        x_min, y_min, x_max, y_max = mouth_bbox
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
        for bbox in eye_bbox.values():
            x_min, y_min, x_max, y_max = bbox
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
        # Sets for O(1) membership instead of scanning 80-element lists for
        # every one of the ~478 landmarks.
        mouth_ids = set(self._MOUTH_LANDMARKS)
        eye_ids = set(self._LEFT_EYE_LANDMARKS) | set(self._RIGHT_EYE_LANDMARKS)
        for landmark_id, landmark in enumerate(face_landmarks):
            cx, cy = int(landmark.x * w), int(landmark.y * h)
            if landmark_id in mouth_ids:
                color = (0, 0, 255)
            elif landmark_id in eye_ids:
                color = (0, 255, 0)
            else:
                color = (255, 255, 255)
            cv2.circle(image, (cx, cy), 2, color, -1)
        cv2.imwrite('image_detect.png', image[:, :, ::-1])

    def get_one_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
        """Detect faces in one RGB frame and extract full per-face geometry.

        Args:
            image: HxWx3 uint8 array, fed to mediapipe as SRGB.
            mouth_bbox_scale: scale factor for the squared mouth crop.
            eye_bbox_scale: scale factor for the squared eye crops.
            annotate_image: when True, also return a landmark-annotated copy.
            save_vis: when True, draw debug boxes/points onto ``image``
                (mutating it) and write ``image_detect.png``.

        Returns:
            An 18-tuple, per-face lists unless noted:
              0  all_x / 1 all_y: nose offset from the cheek midpoint
                 (normalized coords; drives the orientation estimate)
              2  all_orientation: "left" / "right" / "forward"
              3  num_faces (int)
              4  all_keypoints: normalized (x, y, z) landmark triples
              5  all_bounding_box: face boxes [(x0, y0), (x1, y1)] in pixels
              6  all_mouth_bounding_box: squared mouth boxes [x0, y0, x1, y1]
              7  all_eye_bounding_box: {"left_eye": box, "right_eye": box}
              8  all_face_contour: rasters with one white pixel per landmark
              9  all_blendshapes: blendshape score lists
              10 all_facial_transformation_matrices (from mediapipe)
              11 annotated_image (or None)
              12 all_mouth_p / 13 all_nose_p: region box centers (pixels)
              14 all_left_eye_p / 15 all_right_eye_p: eye box centers
              16 all_eyeball: rasters with filled iris discs
              17 all_eyeball_mask: boolean HxWx1 masks of those discs
        """
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
        # Run the landmarker on the single frame.
        results = self.detector.detect(mp_image)
        max_h, max_w = image.shape[:2]
        annotated_image = draw_landmarks_on_image(image, results) if annotate_image else None
        all_x = []
        all_y = []
        all_orientation = []
        all_keypoints = []
        all_bounding_box = []
        all_mouth_bounding_box = []
        all_eye_bounding_box = []
        all_face_contour = []
        all_eyeball = []
        all_eyeball_mask = []
        all_blendshapes = []
        all_mouth_p = []
        all_nose_p = []
        all_left_eye_p = []
        all_right_eye_p = []
        num_faces = len(results.face_landmarks)
        for face_blendshapes in results.face_blendshapes:
            all_blendshapes.append([item.score for item in face_blendshapes])
        all_facial_transformation_matrices = results.facial_transformation_matrixes
        mouth_landmarks = self._MOUTH_LANDMARKS
        left_eye_landmarks = self._LEFT_EYE_LANDMARKS
        right_eye_landmarks = self._RIGHT_EYE_LANDMARKS
        for face_landmarks in results.face_landmarks:
            keypoints = []
            h, w = image.shape[0], image.shape[1]
            cx_min, cy_min = w, h
            cx_max, cy_max = 0, 0
            for idx, lm in enumerate(face_landmarks):
                # Clip landmarks that drift off-image before converting to
                # pixel coordinates, and track the face's pixel extent.
                cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)
                cx_min = min(cx_min, cx)
                cy_min = min(cy_min, cy)
                cx_max = max(cx_max, cx)
                cy_max = max(cy_max, cy)
                keypoints.append((lm.x, lm.y, lm.z))
                if idx == 137:
                    right_cheek = (lm.x, lm.y, lm.z)
                if idx == 366:
                    left_cheek = (lm.x, lm.y, lm.z)
                if idx == 4:
                    nose = (lm.x, lm.y, lm.z)
            # Vector from the midpoint between the cheeks to the nose tip
            # approximates head yaw (x) / pitch (y).
            face_middle = (
                (right_cheek[0] + left_cheek[0]) / 2.0,
                (right_cheek[1] + left_cheek[1]) / 2.0,
            )
            x = nose[0] - face_middle[0]
            y = nose[1] - face_middle[1]
            if x > 0.15:
                orientation = "left"
            elif x < -0.15:
                orientation = "right"
            else:
                orientation = "forward"
            bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]
            all_keypoints.append(keypoints)
            all_bounding_box.append(bounding_box)
            all_x.append(x)
            all_y.append(y)
            all_orientation.append(orientation)
            # Mouth bbox from the lip contour landmarks.
            mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
            mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
            mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
            # Center of the (pre-squaring) mouth bbox.
            # BUGFIX: the y component previously averaged max-x with max-y.
            mouth_p = np.array([(mouth_bbox[0][0] + mouth_bbox[1][0]) / 2, (mouth_bbox[0][1] + mouth_bbox[1][1]) / 2])
            mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)
            # Nose bbox (never squared) and its center.
            nose_landmarks = self._NOSE_LANDMARKS
            nose_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in nose_landmarks]
            nose_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in nose_landmarks]
            nose_bbox = [(min(nose_x), min(nose_y)), (max(nose_x), max(nose_y))]
            # BUGFIX: same x/y mix-up as mouth_p above.
            nose_p = np.array([(nose_bbox[0][0] + nose_bbox[1][0]) / 2, (nose_bbox[0][1] + nose_bbox[1][1]) / 2])
            all_mouth_bounding_box.append(mouth_bbox)
            # Eye bboxes from the eye-ring landmarks.
            left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
            left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
            left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
            left_size = max(left_eye_y) - min(left_eye_y)  # vertical eye extent in px
            # BUGFIX: same x/y mix-up as mouth_p above.
            left_eye_p = np.array([(left_eye_bbox[0][0] + left_eye_bbox[1][0]) / 2, (left_eye_bbox[0][1] + left_eye_bbox[1][1]) / 2])
            left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
            right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
            right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
            right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
            right_size = max(right_eye_y) - min(right_eye_y)
            # BUGFIX: same x/y mix-up as mouth_p above.
            right_eye_p = np.array([(right_eye_bbox[0][0] + right_eye_bbox[1][0]) / 2, (right_eye_bbox[0][1] + right_eye_bbox[1][1]) / 2])
            right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
            eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}
            all_eye_bounding_box.append(eye_bbox)
            # Rasterize every landmark as a single white pixel.
            face_contour = np.zeros_like(image)
            for landmark_id, landmark in enumerate(face_landmarks):
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if 0 <= cy < max_h and 0 <= cx < max_w:
                    face_contour[cy, cx] = (255, 255, 255)
            # Fill iris discs.  Landmarks 468 / 473 are the iris centers
            # (refined-landmarks model -- presumably; verify the model emits
            # 478 landmarks); radius ~ 1/3 of the eye's vertical extent.
            eyeball = np.zeros_like(image)
            for landmark_id, landmark in enumerate(face_landmarks):
                if landmark_id not in (468, 473):
                    continue
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if not (0 <= cy < max_h and 0 <= cx < max_w):
                    continue
                radius = int(left_size // 3) if landmark_id == 468 else int(right_size // 3)
                cv2.circle(eyeball, (cx, cy), radius=radius, color=(255, 0, 0), thickness=-1)
            eyeball_mask = (eyeball.sum(axis=2) != 0)[:, :, None]
            all_eyeball.append(eyeball)
            all_eyeball_mask.append(eyeball_mask)
            all_face_contour.append(face_contour)
            all_mouth_p.append(mouth_p)
            all_nose_p.append(nose_p)
            all_left_eye_p.append(left_eye_p)
            all_right_eye_p.append(right_eye_p)
            if save_vis:
                self._save_debug_visualization(image, mouth_bbox, eye_bbox, face_landmarks, w, h)
        return (
            all_x,
            all_y,
            all_orientation,
            num_faces,
            all_keypoints,
            all_bounding_box,
            all_mouth_bounding_box,
            all_eye_bounding_box,
            all_face_contour,
            all_blendshapes,
            all_facial_transformation_matrices,
            annotated_image,
            all_mouth_p,  # 12
            all_nose_p,  # 13
            all_left_eye_p,  # 14
            all_right_eye_p,  # 15
            all_eyeball,  # 16
            all_eyeball_mask,  # 17
        )

    def get_face_xy_rotation_and_keypoints(self, image, mouth_bbox_scale = 1.2, eye_bbox_scale = 1.5, annotate_image: bool = False, save_vis=False):
        """Detect faces in one RGB frame; reduced variant of
        :meth:`get_one_face_xy_rotation_and_keypoints` that skips the region
        center points and iris masks.

        Args:
            image: HxWx3 uint8 array, fed to mediapipe as SRGB.
            mouth_bbox_scale: scale factor for the squared mouth crop.
            eye_bbox_scale: scale factor for the squared eye crops.
            annotate_image: when True, also return a landmark-annotated copy.
            save_vis: when True, draw debug boxes/points onto ``image``
                (mutating it) and write ``image_detect.png``.

        Returns:
            A 12-tuple matching elements 0-11 of the extended variant:
            (all_x, all_y, all_orientation, num_faces, all_keypoints,
            all_bounding_box, all_mouth_bounding_box, all_eye_bounding_box,
            all_face_contour, all_blendshapes,
            all_facial_transformation_matrices, annotated_image).
        """
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
        # Run the landmarker on the single frame.
        results = self.detector.detect(mp_image)
        max_h, max_w = image.shape[:2]
        annotated_image = draw_landmarks_on_image(image, results) if annotate_image else None
        all_x = []
        all_y = []
        all_orientation = []
        all_keypoints = []
        all_bounding_box = []
        all_mouth_bounding_box = []
        all_eye_bounding_box = []
        all_face_contour = []
        all_blendshapes = []
        num_faces = len(results.face_landmarks)
        for face_blendshapes in results.face_blendshapes:
            all_blendshapes.append([item.score for item in face_blendshapes])
        all_facial_transformation_matrices = results.facial_transformation_matrixes
        mouth_landmarks = self._MOUTH_LANDMARKS
        left_eye_landmarks = self._LEFT_EYE_LANDMARKS
        right_eye_landmarks = self._RIGHT_EYE_LANDMARKS
        for face_landmarks in results.face_landmarks:
            keypoints = []
            h, w = image.shape[0], image.shape[1]
            cx_min, cy_min = w, h
            cx_max, cy_max = 0, 0
            for idx, lm in enumerate(face_landmarks):
                # Clip landmarks that drift off-image before converting to
                # pixel coordinates, and track the face's pixel extent.
                cx, cy = int(np.clip(lm.x, 0, 1) * w), int(np.clip(lm.y, 0, 1) * h)
                cx_min = min(cx_min, cx)
                cy_min = min(cy_min, cy)
                cx_max = max(cx_max, cx)
                cy_max = max(cy_max, cy)
                keypoints.append((lm.x, lm.y, lm.z))
                if idx == 137:
                    right_cheek = (lm.x, lm.y, lm.z)
                if idx == 366:
                    left_cheek = (lm.x, lm.y, lm.z)
                if idx == 4:
                    nose = (lm.x, lm.y, lm.z)
            # Vector from the midpoint between the cheeks to the nose tip
            # approximates head yaw (x) / pitch (y).
            face_middle = (
                (right_cheek[0] + left_cheek[0]) / 2.0,
                (right_cheek[1] + left_cheek[1]) / 2.0,
            )
            x = nose[0] - face_middle[0]
            y = nose[1] - face_middle[1]
            if x > 0.15:
                orientation = "left"
            elif x < -0.15:
                orientation = "right"
            else:
                orientation = "forward"
            bounding_box = [(cx_min, cy_min), (cx_max, cy_max)]
            all_keypoints.append(keypoints)
            all_bounding_box.append(bounding_box)
            all_x.append(x)
            all_y.append(y)
            all_orientation.append(orientation)
            # Mouth bbox from the lip contour landmarks.
            mouth_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in mouth_landmarks]
            mouth_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in mouth_landmarks]
            mouth_bbox = [(min(mouth_x), min(mouth_y)), (max(mouth_x), max(mouth_y))]
            mouth_bbox = convert_bbox_to_square_bbox(mouth_bbox, max_h, max_w, scale=mouth_bbox_scale)
            all_mouth_bounding_box.append(mouth_bbox)
            # Eye bboxes from the eye-ring landmarks.
            left_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in left_eye_landmarks]
            left_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in left_eye_landmarks]
            left_eye_bbox = [(min(left_eye_x), min(left_eye_y)), (max(left_eye_x), max(left_eye_y))]
            left_eye_bbox = convert_bbox_to_square_bbox(left_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
            right_eye_x = [int(np.clip(face_landmarks[idx].x, 0, 1) * w) for idx in right_eye_landmarks]
            right_eye_y = [int(np.clip(face_landmarks[idx].y, 0, 1) * h) for idx in right_eye_landmarks]
            right_eye_bbox = [(min(right_eye_x), min(right_eye_y)), (max(right_eye_x), max(right_eye_y))]
            right_eye_bbox = convert_bbox_to_square_bbox(right_eye_bbox, max_h, max_w, scale=eye_bbox_scale)
            eye_bbox = {"left_eye": left_eye_bbox, "right_eye": right_eye_bbox}
            all_eye_bounding_box.append(eye_bbox)
            # Rasterize every landmark as a single white pixel.
            face_contour = np.zeros_like(image)
            for landmark_id, landmark in enumerate(face_landmarks):
                cx, cy = int(landmark.x * w), int(landmark.y * h)
                if 0 <= cy < max_h and 0 <= cx < max_w:
                    face_contour[cy, cx] = (255, 255, 255)
            all_face_contour.append(face_contour)
            if save_vis:
                # cv2 is imported at module level; the redundant local import
                # was removed.
                self._save_debug_visualization(image, mouth_bbox, eye_bbox, face_landmarks, w, h)
        return (
            all_x,
            all_y,
            all_orientation,
            num_faces,
            all_keypoints,
            all_bounding_box,
            all_mouth_bounding_box,
            all_eye_bounding_box,
            all_face_contour,
            all_blendshapes,
            all_facial_transformation_matrices,
            annotated_image,
        )