import numpy as np
import pandas as pd
import cv2               # camera feed and drawing
import mediapipe as mp   # holistic landmark detection on webcam frames
import tensorflow as tf

# developer modules
from params import (LENGTH, DROP_Z, averaging_sets, point_landmarks_left,
                    point_landmarks_right, FLATTEN, INPUT_SHAPE, RIGHT_HAND,
                    LEFT_HAND, PADDING, CONSTANT_VALUE)

# Initiate mediapipe model and utils
mp_holistic = mp.solutions.holistic      # holistic model
mp_drawing = mp.solutions.drawing_utils  # drawing utilities


# ------------------------------
# Mediapipe
# ------------------------------

def extract_keypoints(results):
    """Extract the x, y, z coordinates of all landmarks from a mediapipe holistic
    result and concatenate them into a single (543, 3) array.
    Missing detections are filled with zeros."""
    face = np.array([[r.x, r.y, r.z] for r in results.face_landmarks.landmark]) if results.face_landmarks else np.zeros([468, 3])
    left_hand = np.array([[r.x, r.y, r.z] for r in results.left_hand_landmarks.landmark]) if results.left_hand_landmarks else np.zeros([21, 3])
    # pose landmarks also carry an extra visibility value, which is dropped here
    pose = np.array([[r.x, r.y, r.z] for r in results.pose_landmarks.landmark]) if results.pose_landmarks else np.zeros([33, 3])
    right_hand = np.array([[r.x, r.y, r.z] for r in results.right_hand_landmarks.landmark]) if results.right_hand_landmarks else np.zeros([21, 3])
    return np.concatenate([face, left_hand, pose, right_hand])


# ------------------------------
# Visualization
# ------------------------------

def draw_styled_landmarks(image, results):
    """Draw landmark points and connecting lines on top of an image,
    e.g. on top of the camera feed."""
    # draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                              mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1),
                              mp_drawing.DrawingSpec(color=(224, 208, 64), thickness=1, circle_radius=1))
    # draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(224, 208, 64), thickness=2, circle_radius=2))
    # draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(224, 208, 64), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(235, 206, 135), thickness=2, circle_radius=2))
    # draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(224, 208, 64), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(128, 128, 240), thickness=2, circle_radius=2))


def prob_viz(pred, SELECTED_SIGNS, input_frame):
    """Visualize the predicted sign probabilities as a dynamic real-time bar chart
    drawn onto a copy of the current frame."""
    output_frame = input_frame.copy()
    bar_zero = 15
    for num, prob in enumerate(pred):
        # cv2.rectangle(image, start_point, end_point, color, thickness)
        cv2.rectangle(output_frame, pt1=(bar_zero, 65 + num * 50),
                      pt2=(bar_zero + int(prob * 100 * 5), 95 + num * 50),
                      color=(200, 200, 200), thickness=-1)
        # cv2.putText(image, text, org, font, fontScale, color, thickness, lineType)
        cv2.putText(img=output_frame, text=SELECTED_SIGNS[num], org=(bar_zero, 90 + num * 50),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(50, 50, 50),
                    thickness=1, lineType=cv2.LINE_AA)
    return output_frame
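
# A minimal usage sketch for the two helpers above, assuming a webcam at index 0
# and a GUI-capable OpenCV build; illustrative only and not called anywhere in
# the pipeline.
def _demo_extract_and_draw():
    cap = cv2.VideoCapture(0)
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        ret, frame = cap.read()
        if ret:
            # mediapipe expects RGB, OpenCV delivers BGR
            results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            keypoints = extract_keypoints(results)   # shape (543, 3)
            annotated = frame.copy()
            draw_styled_landmarks(annotated, results)
            cv2.imshow('landmarks', annotated)
            cv2.waitKey(0)
    cap.release()
    cv2.destroyAllWindows()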
# ------------------------------
# Pre-processing
# ------------------------------

def tf_nan_mean(x, axis=0):
    """Helper for pre-processing: calculates the mean of a TensorFlow tensor x
    along a specified axis while ignoring any NaN values in the tensor."""
    return (tf.reduce_sum(tf.where(tf.math.is_nan(x), tf.zeros_like(x), x), axis=axis)
            / tf.reduce_sum(tf.where(tf.math.is_nan(x), tf.zeros_like(x), tf.ones_like(x)), axis=axis))


def right_hand_percentage(x):
    """Helper for pre-processing: calculates the percentage of right hand usage
    (fraction of non-NaN right-hand values among all hand values)."""
    right = tf.gather(x, RIGHT_HAND, axis=1)
    left = tf.gather(x, LEFT_HAND, axis=1)
    right_count = tf.reduce_sum(tf.where(tf.math.is_nan(right), tf.zeros_like(right), tf.ones_like(right)))
    left_count = tf.reduce_sum(tf.where(tf.math.is_nan(left), tf.zeros_like(left), tf.ones_like(left)))
    return right_count / (left_count + right_count)


class FeatureGen(tf.keras.layers.Layer):
    """Pre-processing layer that will be added to the final model."""

    def __init__(self):
        # initialize the custom layer
        super(FeatureGen, self).__init__()

    def call(self, x_in, MIRROR=False):
        # drop z coordinates if required
        if DROP_Z:
            x_in = x_in[:, :, 0:2]

        if MIRROR:
            # flip x coordinates
            x_in = np.array(x_in)
            x_in[:, :, 0] = (x_in[:, :, 0] - 1) * (-1)
            x_in = tf.convert_to_tensor(x_in)

        # generate a list with mean values for the landmark groups that will be merged
        x_list = [tf.expand_dims(tf_nan_mean(x_in[:, av_set[0]:av_set[0] + av_set[1], :], axis=1), axis=1)
                  for av_set in averaging_sets]

        # extract specific columns from x_in, depending on the dominant hand
        handedness = right_hand_percentage(x_in)
        if handedness > 0.5:
            x_list.append(tf.gather(x_in, point_landmarks_right, axis=1))
        else:
            x_list.append(tf.gather(x_in, point_landmarks_left, axis=1))

        # concatenate the two tensors from above along axis 1 (columns)
        x = tf.concat(x_list, 1)

        # pad/truncate to the desired sequence length (defined by LENGTH)
        x_padded = x
        # get current number of rows
        current_rows = tf.shape(x_padded)[0]

        # if the current number of rows is greater than desired, truncate the excess rows
        if current_rows > LENGTH:
            x_padded = x_padded[:LENGTH, :, :]
        # if the current number of rows is less than desired, add padding
        elif current_rows < LENGTH:
            # calculate the amount of padding needed
            pad_rows = LENGTH - current_rows
            if PADDING == 4:
                # copy first/last frame
                if pad_rows % 2 == 0:  # pad_rows is even
                    padding_front = tf.repeat(x_padded[0:1, :], pad_rows // 2, axis=0)
                    padding_back = tf.repeat(x_padded[-1:, :], pad_rows // 2, axis=0)
                else:                  # pad_rows is odd
                    padding_front = tf.repeat(x_padded[0:1, :], (pad_rows // 2) + 1, axis=0)
                    padding_back = tf.repeat(x_padded[-1:, :], pad_rows // 2, axis=0)
                x_padded = tf.concat([padding_front, x_padded, padding_back], axis=0)
            elif PADDING == 5:
                # copy last frame
                padding_back = tf.repeat(x_padded[-1:, :], pad_rows, axis=0)
                x_padded = tf.concat([x_padded, padding_back], axis=0)
            else:
                if PADDING == 1:
                    # constant padding at start and end
                    if pad_rows % 2 == 0:  # pad_rows is even
                        paddings = [[pad_rows // 2, pad_rows // 2], [0, 0], [0, 0]]
                    else:                  # pad_rows is odd
                        paddings = [[pad_rows // 2 + 1, pad_rows // 2], [0, 0], [0, 0]]
                elif PADDING == 2:
                    # constant padding only at the end of the sequence
                    paddings = [[0, pad_rows], [0, 0], [0, 0]]
                elif PADDING == 3:
                    # no padding
                    paddings = [[0, 0], [0, 0], [0, 0]]
                x_padded = tf.pad(x_padded, paddings, mode='CONSTANT', constant_values=CONSTANT_VALUE)

        x = x_padded
        current_rows = tf.shape(x)[0]

        # interpolate single missing values
        x = pd.DataFrame(np.array(x).flatten()).interpolate(method='linear', limit=2, limit_direction='both')
        # fill remaining missing values with zeros
        x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)

        # reshape data to a 2D or 3D array
        if FLATTEN:
            x = tf.reshape(x, (1, current_rows * INPUT_SHAPE[1]))
        else:
            x = tf.reshape(x, (1, current_rows, INPUT_SHAPE[1]))
        return x
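
# A minimal sketch of how this pre-processing layer can be exercised on its own,
# assuming mediapipe keypoints laid out as 543 rows of x/y/z per frame; the
# 22-frame random input is a hypothetical stand-in for a real extracted sequence.
def _demo_feature_gen():
    converter = FeatureGen()
    dummy_sequence = np.random.rand(22, 543, 3).astype(np.float32)  # 22 frames of (543, 3) keypoints
    features = converter(dummy_sequence)
    # features is padded/truncated to LENGTH frames and reshaped to
    # (1, LENGTH, INPUT_SHAPE[1]), or (1, LENGTH * INPUT_SHAPE[1]) if FLATTEN
    print(features.shape)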
# define converter using the generated layer
feature_converter = FeatureGen()


# ------------------------------
# Real-time prediction
# ------------------------------

def real_time_prediction(results, sequence, predictions, threshold, LENGTH, MODEL,
                         SELECTED_LABELS, TRANSITION_FRAMES, SELECTED_SIGNS):
    sign = ''
    prob = 0

    # 1. Extract keypoints into a sequence
    # extract x, y, z keypoints for face, left_hand, pose and right_hand from the
    # mediapipe holistic results; keypoints.shape e.g. (543, 3)
    keypoints = extract_keypoints(results)
    # keep appending keypoints (frames) to the sequence, np.array(sequence).shape e.g. (22, 543, 3)
    sequence.append(keypoints)
    # keep only the last LENGTH frames of the sequence
    sequence = sequence[-LENGTH:]

    # 2. Predict once the sequence is full
    if len(sequence) == LENGTH:
        # pre-processing
        model_input = feature_converter(np.array(sequence))
        # print(f'OMG! Frenzy Franzi is converting your mediapipe input! See how the shape is changing from {np.array(sequence).shape} to {model_input.shape}! SO AWESOME!!!')

        # prediction; MODEL.predict() expects input of shape (num_sequences, LENGTH, num_features),
        # e.g. (1, 30, 1662) for a single sequence
        pred = MODEL.predict(model_input)[0]
        # select only the subset of signs defined in SELECTED_LABELS
        pred = pred[SELECTED_LABELS]
        # append the current prediction
        predictions.append(np.argmax(pred))

        # 3. Visualization logic
        # make sure the last TRANSITION_FRAMES predictions were all the same
        # (more stable transition from one sign to another)
        if np.unique(predictions[-TRANSITION_FRAMES:])[0] == np.argmax(pred):
            # only report a sign if the most confident prediction is above the threshold
            if pred[np.argmax(pred)] > threshold:
                sign = SELECTED_SIGNS[np.argmax(pred)]
                prob = pred[np.argmax(pred)]
                prob = np.round(float(prob), 2)
            else:
                sign = ' '
                prob = 0

    return sign, prob


# ------------------------------
# Streamlit
# ------------------------------
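
# A minimal sketch of how the functions above could be driven from a Streamlit
# app; the loaded MODEL, SELECTED_LABELS/SELECTED_SIGNS, threshold and
# TRANSITION_FRAMES are assumptions that the actual app would supply, and the
# streamlit import is kept local so the module has no hard dependency on it.
def _demo_streamlit_loop(MODEL, SELECTED_LABELS, SELECTED_SIGNS,
                         threshold=0.5, TRANSITION_FRAMES=10):
    import streamlit as st

    st.title('Sign prediction demo')
    frame_placeholder = st.empty()
    text_placeholder = st.empty()

    sequence, predictions = [], []
    cap = cv2.VideoCapture(0)
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            sign, prob = real_time_prediction(results, sequence, predictions,
                                              threshold, LENGTH, MODEL,
                                              SELECTED_LABELS, TRANSITION_FRAMES,
                                              SELECTED_SIGNS)
            # real_time_prediction only trims its local copy, so keep the
            # caller-side sequence bounded here
            sequence = sequence[-LENGTH:]
            draw_styled_landmarks(frame, results)
            frame_placeholder.image(frame, channels='BGR')
            text_placeholder.write(f'{sign} ({prob})')
    cap.release()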