# -*- coding: utf-8 -*- """ ###################################################################################################################################### This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. Any publications arising from the use of this software, including but not limited to academic journal and conference publications, technical reports and manuals, must cite at least one of the following works: Revisiting Data Normalization for Appearance-Based Gaze Estimation Xucong Zhang, Yusuke Sugano, Andreas Bulling in Proc. International Symposium on Eye Tracking Research and Applications (ETRA), 2018 ###################################################################################################################################### """ import os import cv2 import numpy as np import csv import argparse # import dlib import glob def normalize_woimg(landmarks, focal_norm, distance_norm, roi_size, center, hr, ht, cam, gc=None): center = center.reshape(3,1) ## universal function for data normalization hR = cv2.Rodrigues(hr)[0] # rotation matrix ## ---------- normalize image ---------- distance = np.linalg.norm(center) # actual distance between eye and original camera z_scale = distance_norm/distance cam_norm = np.array([ [focal_norm, 0, roi_size[0]/2], [0, focal_norm, roi_size[1]/2], [0, 0, 1.0], ]) S = np.array([ # scaling matrix [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, z_scale], ]) hRx = hR[:,0] forward = (center/distance).reshape(3) down = np.cross(forward, hRx) down /= np.linalg.norm(down) right = np.cross(down, forward) right /= np.linalg.norm(right) R = np.c_[right, down, forward].T # rotation matrix R W = np.dot(np.dot(cam_norm, S), np.dot(R, np.linalg.inv(cam))) # transformation matrix ## ---------- normalize rotation ---------- hR_norm = np.dot(R, hR) # rotation matrix in normalized space # hr_norm = cv2.Rodrigues(hR_norm)[0] # convert rotation matrix to rotation vectors ## ---------- normalize gaze vector ---------- gc_normalized = None num_point = landmarks.shape[0] landmarks_warped = cv2.perspectiveTransform(landmarks.reshape(-1,1,2).astype('float32'), W) landmarks_warped = landmarks_warped.reshape(num_point, 2) if gc is not None: gc_normalized = gc.reshape((3,1)) - center # gaze vector # For modified data normalization, scaling is not applied to gaze direction (only R applied). # For original data normalization, here should be: # "M = np.dot(S,R) # gc_normalized = np.dot(R, gc_normalized)" gc_normalized = np.dot(R, gc_normalized) gc_normalized = gc_normalized/np.linalg.norm(gc_normalized) return [None, R, hR_norm, gc_normalized, landmarks_warped, W] def normalize(img, landmarks, focal_norm, distance_norm, roi_size, center, hr, ht, cam, gc=None): center = center.reshape(3,1) ## universal function for data normalization hR = cv2.Rodrigues(hr)[0] # rotation matrix ## ---------- normalize image ---------- distance = np.linalg.norm(center) # actual distance between eye and original camera z_scale = distance_norm/distance cam_norm = np.array([ [focal_norm, 0, roi_size[0]/2], [0, focal_norm, roi_size[1]/2], [0, 0, 1.0], ]) S = np.array([ # scaling matrix [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, z_scale], ]) hRx = hR[:,0] forward = (center/distance).reshape(3) down = np.cross(forward, hRx) down /= np.linalg.norm(down) right = np.cross(down, forward) right /= np.linalg.norm(right) R = np.c_[right, down, forward].T # rotation matrix R W = np.dot(np.dot(cam_norm, S), np.dot(R, np.linalg.inv(cam))) # transformation matrix # if img is not None: # img_warped = cv2.warpPerspective(img, W, roi_size) # image normalization # else: # img_warped = None img_warped = cv2.warpPerspective(img, W, roi_size) # image normalization ## ---------- normalize rotation ---------- hR_norm = np.dot(R, hR) # rotation matrix in normalized space # hr_norm = cv2.Rodrigues(hR_norm)[0] # convert rotation matrix to rotation vectors ## ---------- normalize gaze vector ---------- gc_normalized = None num_point = landmarks.shape[0] landmarks_warped = cv2.perspectiveTransform(landmarks.reshape(-1,1,2).astype('float32'), W) landmarks_warped = landmarks_warped.reshape(num_point, 2) if gc is not None: gc_normalized = gc.reshape((3,1)) - center # gaze vector # For modified data normalization, scaling is not applied to gaze direction (only R applied). # For original data normalization, here should be: # "M = np.dot(S,R) # gc_normalized = np.dot(R, gc_normalized)" gc_normalized = np.dot(R, gc_normalized) gc_normalized = gc_normalized/np.linalg.norm(gc_normalized) return [img_warped, R, hR_norm, gc_normalized, landmarks_warped, W] def normalize_face(img, face, hr, ht, cam, gc=None): ## normalized camera parameters focal_norm = 960 # focal length of normalized camera distance_norm = 600 # normalized distance between eye and camera roi_size = (224, 224) # size of cropped eye image ## compute estimated 3D positions of the landmarks ht = ht.reshape((3,1)) hR = cv2.Rodrigues(hr)[0] # rotation matrix Fc = np.dot(hR, face) + ht # 3D positions of facial landmarks # fm = np.mean(Fc, axis=1).reshape((3,1)) # center of facial landmarks two_eye_center = np.mean(Fc[:, 0:4], axis=1).reshape((3, 1)) nose_center = np.mean(Fc[:, 4:6], axis=1).reshape((3, 1)) # get the face center face_center = np.mean(np.concatenate((two_eye_center, nose_center), axis=1), axis=1).reshape((3, 1)) # face_center = np.mean(Fc, axis=1).reshape((3,1)) return normalize(img, focal_norm, distance_norm, roi_size, face_center, hr, ht, cam, gc) def normalize_eye(img, face, hr, ht, cam, gc=None): ## normalized camera parameters focal_norm = 960 # focal length of normalized camera distance_norm = 600 # normalized distance between eye and camera roi_size = (60, 36) # size of cropped eye image ## compute estimated 3D positions of the landmarks ht = ht.reshape((3,1)) hR = cv2.Rodrigues(hr)[0] # rotation matrix Fc = np.dot(hR, face) + ht # 3D positions of facial landmarks re = 0.5*(Fc[:,0] + Fc[:,1]).reshape((3,1)) # center of left eye le = 0.5*(Fc[:,2] + Fc[:,3]).reshape((3,1)) # center of right eye ## normalize each eye data = [ normalize(img, focal_norm, distance_norm, roi_size, re, hr, ht, cam, gc), normalize(img, focal_norm, distance_norm, roi_size, le, hr, ht, cam, gc) ] return data def load_calibration(calib_path): ## load calibration data, these paramters are expected to be obtained by camera calibration functions in OpenCV fs = cv2.FileStorage(calib_path, cv2.FILE_STORAGE_READ) camera_matrix = fs.getNode('camera_matrix').mat() camera_distortion = fs.getNode('dist_coeffs').mat() return camera_matrix, camera_distortion def load_facemodel(model_path): # load the generic face model, which includes 6 facial landmarks: four eye corners and two mouth corners fs = cv2.FileStorage(model_path, cv2.FILE_STORAGE_READ) face_model = fs.getNode('face_model').mat() return face_model def read_image(img_path, camera_matrix, camera_distortion): # load input image and undistort img_original = cv2.imread(img_path) img = cv2.undistort(img_original, camera_matrix, camera_distortion) return img def estimateHeadPose(landmarks, face_model, camera, distortion, iterate=True): ret, rvec, tvec = cv2.solvePnP(face_model, landmarks, camera, distortion, flags=cv2.SOLVEPNP_EPNP) ## further optimize if iterate: ret, rvec, tvec = cv2.solvePnP(face_model, landmarks, camera, distortion, rvec, tvec, True) return rvec, tvec def detect_landmark(img, detector_path, predictor_path): ## obtain facial landmarks using dlib detector = dlib.cnn_face_detection_model_v1(detector_path) dets = detector(img, 0) if len(dets) == 0: return None predictor = dlib.shape_predictor(predictor_path) shape = predictor(img, dets[0].rect) ## extract required keypoints landmarks = np.array([ [shape.part(36).x, shape.part(36).y], [shape.part(39).x, shape.part(39).y], [shape.part(42).x, shape.part(42).y], [shape.part(45).x, shape.part(45).y], [shape.part(48).x, shape.part(48).y], [shape.part(54).x, shape.part(54).y] ]) return landmarks def read_landmark(img_path): img_file = img_path.split(os.path.sep)[-1] day = img_path.split(os.path.sep)[-2] person = img_path.split(os.path.sep)[-3] person_path = os.path.split(os.path.split(img_path)[0])[0] person_txt = os.path.join(person_path, person+'.txt') index = os.path.join(day,img_file) print(person_txt) print(index) with open(person_txt) as f: data = f.readlines() reader = csv.reader(data) p = {} for row in reader: words = row[0].split() p[words[0]] = words[1:] landmarks = np.array([int(i) for i in p[index][2:14]]).reshape((6,2)) return landmarks # def process_image(img_path, detector_path, predictor_path, camera_matrix, camera_distortion, face_model, gc=None): # # read input image # img = read_image(img_path, camera_matrix, camera_distortion) # # detect facial landmarks # landmarks = detect_landmark(img, detector_path, predictor_path) # if landmarks is not None: # # estimate head pose # hr, ht = estimateHeadPose(face_model, landmarks, camera_matrix, camera_distortion) # # data normalization for left and right eye image # normalized_eyes = normalize_eye(img, face_model, hr, ht, camera_matrix, gc) # # data normalization for full face # normalized_face = normalize_face(img, face_model, hr, ht, camera_matrix, gc) # # return a list of [reye, leye, face] # return normalized_eyes + [normalized_face]