# NOTE: web-page header from the original Hugging Face Spaces file view,
# converted to comments so the file remains valid Python.
# (commit e817788, file size 10,893 bytes)
import numpy as np
import pandas as pd
import cv2 # for camera feed
import mediapipe as mp # for accessing and reading from webcam
import tensorflow as tf
# developer modules
from params import LENGTH, DROP_Z, averaging_sets, point_landmarks_left, point_landmarks_right, FLATTEN, INPUT_SHAPE, RIGHT_HAND, LEFT_HAND, PADDING, CONSTANT_VALUE
# Initiate mediapipe model and utils (module-level handles used by the
# extraction and drawing functions below)
mp_holistic = mp.solutions.holistic # holistic model (face mesh, pose, both hands)
mp_drawing = mp.solutions.drawing_utils # helpers for rendering landmarks onto frames
# ------------------------------
# Mediapipe
# ------------------------------
# function to extract coordinates (+visibility) of all landmarks --> keypoints
# and concatenates everything into a flattened list
def extract_keypoints(results):
    """Collect x/y/z coordinates of every holistic landmark into one array.

    Concatenates face (468), left hand (21), pose (33) and right hand (21)
    landmarks, in that order, into a (543, 3) array. Any group mediapipe did
    not detect is filled with zeros so the output shape is always the same.
    """
    def _coords(group, count):
        # One (count, 3) array per landmark group; zeros when not detected.
        return np.array([[p.x, p.y, p.z] for p in group.landmark]) if group else np.zeros([count, 3])

    parts = [
        _coords(results.face_landmarks, 468),
        _coords(results.left_hand_landmarks, 21),
        _coords(results.pose_landmarks, 33),
        _coords(results.right_hand_landmarks, 21),
    ]
    return np.concatenate(parts)
# ------------------------------
# Visualization
# ------------------------------
# function to draw landmarks points and connecting lines on top of an image, e.g. on top of your camera feed
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks with per-group styling onto `image`.

    Mutates `image` in place via mp_drawing.draw_landmarks; styling is
    (point color/thickness/radius, connection color/thickness/radius).
    """
    # (landmarks, connections, point spec, connection spec) per group,
    # drawn in the same order as before: face, pose, left hand, right hand.
    groups = [
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
         ((80, 110, 10), 1, 1), ((224, 208, 64), 1, 1)),
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         ((80, 22, 10), 2, 4), ((224, 208, 64), 2, 2)),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         ((224, 208, 64), 2, 4), ((235, 206, 135), 2, 2)),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         ((224, 208, 64), 2, 4), ((128, 128, 240), 2, 2)),
    ]
    for landmarks, connections, (pc, pt, pr), (cc, ct, cr) in groups:
        mp_drawing.draw_landmarks(
            image, landmarks, connections,
            mp_drawing.DrawingSpec(color=pc, thickness=pt, circle_radius=pr),
            mp_drawing.DrawingSpec(color=cc, thickness=ct, circle_radius=cr))
# function to visualize predicted word probabilities with a dynamic real-time bar chart
def prob_viz(pred, SELECTED_SIGNS, input_frame):
    """Render one horizontal probability bar per sign onto a copy of the frame.

    Bar width scales with the predicted probability; the sign's name is drawn
    on top of each bar. Returns the annotated copy (input_frame is untouched).
    """
    canvas = input_frame.copy()
    x_origin = 15  # left edge of every bar
    for idx, p in enumerate(pred):
        row_top = 65 + idx * 50  # 50 px of vertical space per sign
        # filled grey bar; -1 thickness means filled rectangle
        cv2.rectangle(canvas,
                      pt1=(x_origin, row_top),
                      pt2=(x_origin + int(p * 100 * 5), row_top + 30),
                      color=(200, 200, 200), thickness=-1)
        # sign label drawn over the bar
        cv2.putText(img=canvas,
                    text=SELECTED_SIGNS[idx],
                    org=(x_origin, row_top + 25),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1,
                    color=(50, 50, 50),
                    thickness=1, lineType=cv2.LINE_AA)
    return canvas
# ------------------------------
# Pre-processing
# ------------------------------
# helper function for pre-processing
def tf_nan_mean(x, axis=0):
    """Mean of tensor `x` along `axis`, ignoring NaN entries.

    NaNs contribute neither to the sum nor to the element count, so the
    result matches numpy's nanmean for the reduced axis.
    """
    nan_mask = tf.math.is_nan(x)
    zeros = tf.zeros_like(x)
    total = tf.reduce_sum(tf.where(nan_mask, zeros, x), axis=axis)
    count = tf.reduce_sum(tf.where(nan_mask, zeros, tf.ones_like(x)), axis=axis)
    return total / count
# helper function for pre-processing
def right_hand_percentage(x):
    """Fraction of non-NaN hand values belonging to the right hand.

    Gathers the right- and left-hand landmark columns of `x` and returns
    right_count / (left_count + right_count), where a count is the number
    of non-NaN scalar values in that hand's slice.
    """
    def _non_nan_count(indices):
        # Count scalar entries that are present (not NaN) for one hand.
        vals = tf.gather(x, indices, axis=1)
        return tf.reduce_sum(tf.where(tf.math.is_nan(vals), tf.zeros_like(vals), tf.ones_like(vals)))

    right_count = _non_nan_count(RIGHT_HAND)
    left_count = _non_nan_count(LEFT_HAND)
    return right_count / (left_count + right_count)
#generating preprocessing layer that will be added to final model
class FeatureGen(tf.keras.layers.Layer):
    """Preprocessing layer turning a raw landmark sequence into model input.

    Pipeline (all switches come from `params`): optionally drop z
    coordinates (DROP_Z), optionally mirror x (MIRROR), average selected
    landmark groups (averaging_sets), keep the dominant hand's landmark
    columns, pad/truncate the frame axis to LENGTH (strategy chosen by
    PADDING), interpolate short NaN gaps, zero-fill the rest, and reshape
    to the model's 2D or 3D input shape (FLATTEN / INPUT_SHAPE).
    """

    def __init__(self):
        # No weights or state — the layer is a pure transformation.
        super(FeatureGen, self).__init__()

    def call(self, x_in, MIRROR=False):
        # x_in: (frames, landmarks, coords) tensor of mediapipe keypoints
        # — assumed from extract_keypoints' (543, 3) per-frame layout;
        # TODO confirm against callers.
        # Drop z coordinates if configured, keeping only x and y.
        if DROP_Z:
            x_in = x_in[:, :, 0:2]
        if MIRROR:
            # Flip x coordinates horizontally: x -> 1 - x.
            # NOTE(review): the numpy round-trip below breaks graph-mode
            # tracing, so MIRROR=True presumably only works eagerly — confirm.
            x_in = np.array(x_in)
            x_in[:, :, 0] = (x_in[:, :, 0]-1)*(-1)
            x_in = tf.convert_to_tensor(x_in)
        # One NaN-aware mean per averaging_sets entry: each (start, count)
        # slice of landmarks collapses to a single averaged landmark.
        x_list = [tf.expand_dims(tf_nan_mean(x_in[:, av_set[0]:av_set[0]+av_set[1], :], axis=1), axis=1) for av_set in averaging_sets]
        # Keep only the landmark columns of whichever hand is used more.
        handedness = right_hand_percentage(x_in)
        if handedness > 0.5:
            x_list.append(tf.gather(x_in, point_landmarks_right, axis=1))
        else:
            x_list.append(tf.gather(x_in, point_landmarks_left, axis=1))
        # Merge averaged landmarks and hand landmarks along the landmark axis.
        x = tf.concat(x_list, 1)
        # --- pad or truncate the frame axis to exactly LENGTH rows ---
        x_padded = x
        current_rows = tf.shape(x_padded)[0]
        # Too many frames: keep only the first LENGTH.
        if current_rows > LENGTH:
            x_padded = x_padded[:LENGTH, :, :]
        # Too few frames: pad according to the PADDING strategy.
        elif current_rows < LENGTH:
            # How many rows are missing.
            pad_rows = LENGTH - current_rows
            if PADDING ==4: # replicate first/last frame on both sides
                if pad_rows %2 == 0: # even: split padding equally
                    padding_front = tf.repeat(x_padded[0:1, :], pad_rows//2, axis=0)
                    padding_back = tf.repeat(x_padded[-1:, :], pad_rows//2, axis=0)
                else: # odd: the front gets the extra frame
                    padding_front = tf.repeat(x_padded[0:1, :], (pad_rows//2)+1, axis=0)
                    padding_back = tf.repeat(x_padded[-1:, :], pad_rows//2, axis=0)
                x_padded = tf.concat([padding_front, x_padded, padding_back], axis=0)
            elif PADDING == 5: # replicate only the last frame
                padding_back = tf.repeat(x_padded[-1:, :], pad_rows, axis=0)
                x_padded = tf.concat([x_padded, padding_back], axis=0)
            else:
                # Constant-value padding variants (value = CONSTANT_VALUE).
                # NOTE(review): a PADDING value outside 1-5 leaves `paddings`
                # undefined and raises NameError here — confirm params
                # guarantees PADDING is in range.
                if PADDING ==1: # pad symmetrically at start and end
                    if pad_rows %2 == 0: # even: split padding equally
                        paddings = [[pad_rows//2, pad_rows//2], [0, 0], [0, 0]]
                    else: # odd: the front gets the extra row
                        paddings = [[pad_rows//2+1, pad_rows//2], [0, 0], [0, 0]]
                elif PADDING ==2: # pad only at the end of the sequence
                    paddings = [[0, pad_rows], [0, 0], [0, 0]]
                elif PADDING ==3: # no padding at all
                    paddings = [[0, 0], [0, 0], [0, 0]]
                x_padded = tf.pad(x_padded, paddings, mode='CONSTANT', constant_values=CONSTANT_VALUE)
        x = x_padded
        # Row count after padding/truncation (used by the reshape below).
        current_rows = tf.shape(x)[0]
        # Linearly interpolate gaps of up to 2 consecutive missing values.
        # NOTE(review): flattening to 1-D lets interpolation bleed across
        # frame/landmark boundaries — confirm this is intended.
        x = pd.DataFrame(np.array(x).flatten()).interpolate(method='linear', limit=2, limit_direction='both')
        # Any NaNs that interpolation could not fill become zeros.
        x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
        # Reshape to (1, rows*features) or (1, rows, features) for the model.
        if FLATTEN:
            x = tf.reshape(x, (1, current_rows*INPUT_SHAPE[1]))
        else:
            x = tf.reshape(x, (1, current_rows, INPUT_SHAPE[1]))
        return x
# Module-level preprocessing singleton, used by real_time_prediction below.
feature_converter = FeatureGen()
# ------------------------------
# Real-time prediction
# ------------------------------
def real_time_prediction(results, sequence, predictions, threshold, LENGTH, MODEL, SELECTED_LABELS, TRANSITION_FRAMES, SELECTED_SIGNS):
    """Buffer the current frame's keypoints and return a stable prediction.

    Args:
        results: mediapipe holistic output for the current frame.
        sequence: mutable list of per-frame keypoint arrays; trimmed IN PLACE
            to the last LENGTH frames so it cannot grow without bound.
        predictions: mutable list of past argmax indices, used to require
            TRANSITION_FRAMES consecutive identical predictions.
        threshold: minimum confidence required to report a sign.
        LENGTH: number of frames the model expects per sequence.
        MODEL: trained model; MODEL.predict(batch) returns class scores.
        SELECTED_LABELS: indices selecting the subset of signs in use.
        TRANSITION_FRAMES: how many recent frames must agree before a sign
            is shown (smooths transitions between signs).
        SELECTED_SIGNS: human-readable sign names matching SELECTED_LABELS.

    Returns:
        (sign, prob): recognized sign name and rounded probability, or
        ('', 0) while no full sequence / stable confident prediction exists.
    """
    sign = ''
    prob = 0
    # Extract this frame's keypoints (e.g. shape (543, 3)) and buffer them.
    keypoints = extract_keypoints(results)
    sequence.append(keypoints)
    # FIX: trim in place. The original `sequence = sequence[-LENGTH:]` only
    # rebound the local name, so the caller's list grew without bound.
    del sequence[:-LENGTH]
    # Predict only once a full sequence of LENGTH frames is available.
    if len(sequence) == LENGTH:
        # Pre-processing to the model's expected input shape.
        model_input = feature_converter(np.array(sequence))
        pred = MODEL.predict(model_input)[0]
        pred = pred[SELECTED_LABELS]  # keep only the selected subset of signs
        current = int(np.argmax(pred))
        predictions.append(current)
        # Require the last TRANSITION_FRAMES predictions to all equal the
        # current one. FIX: the original compared np.unique(...)[0] (the
        # smallest value, since np.unique sorts) with the argmax, so a mixed
        # window such as [2, 5] with argmax 2 wrongly counted as stable.
        recent = predictions[-TRANSITION_FRAMES:]
        if all(p == current for p in recent):
            # Report the sign only when its confidence clears the threshold.
            if pred[current] > threshold:
                sign = SELECTED_SIGNS[current]
                prob = np.round(float(pred[current]), 2)
            else:
                sign = ' '
                prob = 0
    return sign, prob
# ------------------------------
# Streamlit
# ------------------------------