Spaces:
Runtime error
Runtime error
| # Facial Recognition with Emotion / Sentiment Detector | |
| # This is a custom, hard-coded version of darknet with | |
| # YOLOv3 implementation for openimages database. This | |
| # was written to test viability of implementing YOLO | |
| # for face detection followed by emotion / sentiment | |
| # analysis. | |
| # | |
| # Configuration, weights and data are hardcoded. | |
| # This version takes any images, detects faces, | |
| # and then runs emotion / sentiment analysis | |
| # | |
| # Author : Saikiran Tharimena | |
| # Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez | |
| # Project : Emotion / Sentiment Detection from news images | |
| # Date : 12 September 2022 | |
| # Version : v0.1 | |
| # | |
| # (C) Schibsted ASA | |
| # Libraries | |
| import torch | |
| from utils import * | |
| import gradio as gr | |
| from numpy import array | |
| from darknet import Darknet | |
| from torch.autograd import Variable | |
| from torch.cuda import is_available as check_cuda | |
| from PIL.ImageOps import grayscale | |
| from fastai.vision.all import PILImage, load_learner | |
################## DARKNET ##################
# Detection settings
batch_size = 1
confidence = 0.25
nms_thresh = 0.30
run_cuda = False

# Hard-coded locations of the network definition, class names and weights
cfg = 'cfg/yolov3-openimages.cfg'
clsnames = 'cfg/openimages.names'
weights = 'cfg/yolov3-openimages.weights'

# Class names, indexed by the class id the network predicts
classes = load_classes(clsnames)
num_classes = len(classes)

# Build the network and populate it with the trained weights
print('Load Network')
model = Darknet(cfg)
print('Load Weights')
model.load_weights(weights)
print('Successfully loaded Network')

# The GPU is only considered when explicitly requested through run_cuda
CUDA = check_cuda() if run_cuda else False

# Network input size, taken from the "height" entry of the cfg file
inp_dim = int(model.net_info["height"])

# Move the model to the GPU when one is in use
if CUDA:
    model.cuda()

# Inference only: switch the model to evaluation mode
model.eval()
def get_detections(x):
    """Convert one detection row into a ``(label, box)`` pair.

    ``x`` is one row of the detector output: elements 1..4 are the box
    corners and the last element is the class index into ``classes``.

    Returns
    -------
    tuple
        ``(label, (x1, y1, x2, y2))`` with the coordinates truncated to int.
    """
    corners = tuple(int(value) for value in x[1:5])
    class_index = int(x[-1])
    return (f"{classes[class_index]}", corners)
# face detector
def detector(image):
    """Run the YOLOv3 network on a single image and return labelled boxes.

    Parameters
    ----------
    image
        Image array exposing ``shape`` as ``(height, width, ...)``.
        Callers in this file pass ``array(img)[..., :3]``, i.e. presumably
        an RGB numpy array -- confirm against ``prep_image``.

    Returns
    -------
    tuple
        ``(image, detections)``. ``image`` is the input, unchanged, and
        ``detections`` is a list of ``(label, (x1, y1, x2, y2))`` pairs as
        produced by ``get_detections``. The list is empty when the network
        reports nothing.

    Notes
    -----
    Exactly one image is processed per call, so the network is fed a single
    batch and the module-level ``batch_size`` is not consulted here.
    """
    batch = prep_image(image, inp_dim)

    # One row of (width, height, width, height) for the single input image.
    im_dim_list = torch.FloatTensor([(image.shape[1], image.shape[0])]).repeat(1, 2)

    if CUDA:
        im_dim_list = im_dim_list.cuda()
        batch = batch.cuda()

    with torch.no_grad():
        prediction = model(Variable(batch), CUDA)

    prediction = write_results(prediction, confidence, num_classes, nms_conf=nms_thresh)

    # An int result (instead of a tensor) is treated as "nothing detected".
    if isinstance(prediction, int):
        return image, []

    if CUDA:
        torch.cuda.synchronize()

    output = prediction

    # Column 0 holds the image index of each detection; use it to fetch the
    # matching image dimensions for every row.
    im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())

    # Map box corners from network-input coordinates back to the original
    # image (presumably undoing the resize/padding done by prep_image).
    # The scale is derived from inp_dim so it always agrees with the offsets
    # below instead of relying on a hard-coded 608.
    scaling_factor = torch.min(inp_dim / im_dim_list, 1)[0].view(-1, 1)
    output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
    output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
    output[:, 1:5] /= scaling_factor

    # Keep every corner inside the image bounds.
    for row in range(output.shape[0]):
        output[row, [1, 3]] = torch.clamp(output[row, [1, 3]], 0.0, im_dim_list[row, 0])
        output[row, [2, 4]] = torch.clamp(output[row, [2, 4]], 0.0, im_dim_list[row, 1])

    detections = [get_detections(row) for row in output]

    if CUDA:
        torch.cuda.empty_cache()

    return image, detections
#############################################
def _load_classifier(path):
    """Load an exported fastai learner and return it with its label vocabulary."""
    learner = load_learner(path)
    return learner, learner.dls.vocab


# Emotion
learn_emotion, learn_emotion_labels = _load_classifier('models/emotions_vgg19.pkl')

# Sentiment
learn_sentiment, learn_sentiment_labels = _load_classifier('models/sentiment_vgg19.pkl')
def crop_images(img, bbox):
    """Cut the region described by a detection out of ``img``.

    Parameters
    ----------
    img
        Image object providing ``crop(box)`` (e.g. one created with
        ``PILImage.create``).
    bbox
        Detection pair ``(label, (xmin, ymin, xmax, ymax))``; only the
        coordinates in position 1 are used.

    Returns
    -------
    Whatever ``img.crop`` returns for the 4-tuple box.
    """
    left, upper, right, lower = bbox[1]
    region = (left, upper, right, lower)
    return img.crop(region)
def detect_person_face(img, detections):
    """Look for a face inside each detected person.

    Called from ``detect_face`` when only people (no faces) were found.
    Each person box is cropped out of ``img`` and sent through the detector
    again; the first 'Human face' found in that crop, if any, is cropped
    and collected.

    Parameters
    ----------
    img
        Image the ``detections`` boxes refer to.
    detections
        Iterable of ``(label, box)`` pairs, one per person.

    Returns
    -------
    list
        At most one face crop per person, in input order.
    """
    found = []
    for person in detections:
        person_crop = crop_images(img, person)

        # Second detector pass, restricted to this person's crop
        # (at most the first three channels are passed on).
        _, hits = detector(array(person_crop)[..., :3])

        # Only the first face per person is kept.
        first_face = next((hit for hit in hits if hit[0] == 'Human face'), None)
        if first_face is not None:
            found.append(crop_images(person_crop, first_face))
    return found
def detect_face(img):
    """Return the face crops found in ``img``.

    The detector is run on the image. If it reports any 'Human face', one
    crop per face is returned. Otherwise any 'Person' detections are handed
    to ``detect_person_face`` for a second, per-person pass. An empty list
    is returned when neither faces nor people are detected.
    """
    _, found = detector(array(img)[..., :3])

    face_hits = [hit for hit in found if hit[0] == 'Human face']
    if face_hits:
        return [crop_images(img, hit) for hit in face_hits]

    # No direct face detections: fall back to people and search inside them.
    people = [hit for hit in found if hit[0] == 'Person']
    if not people:
        return []
    return detect_person_face(img, people)
# Predict
def _label_scores(learner, labels, pixels):
    """Run ``learner`` on ``pixels`` and map each label to its probability."""
    _, _, probs = learner.predict(pixels)
    return {label: float(probs[i]) for i, label in enumerate(labels)}


def _score_image(picture):
    """Resize ``picture`` to 48x48 and score it with both classifiers.

    Returns ``[thumbnail, emotions, sentiments]``, i.e. the three values
    shown for one person slot in the interface.
    """
    thumbnail = picture.resize((48, 48))
    # Both classifiers receive the same grayscale array; build it once.
    pixels = array(grayscale(thumbnail))
    emotions = _label_scores(learn_emotion, learn_emotion_labels, pixels)
    sentiments = _label_scores(learn_sentiment, learn_sentiment_labels, pixels)
    return [thumbnail, emotions, sentiments]


def predict(img):
    """Score up to three faces in ``img`` for emotion and sentiment.

    Parameters
    ----------
    img
        Anything accepted by ``PILImage.create`` (the image handed over by
        the Gradio input component).

    Returns
    -------
    list
        Always nine items: ``[image, emotions, sentiments]`` repeated for
        three person slots. When fewer than three faces are found, the last
        scored slot is repeated to fill the remaining ones. When no face is
        found, the whole image is scored and fills all three slots.
    """
    img = PILImage.create(img)

    # Detect faces
    faces = detect_face(img)

    # Max 3 faces for now; without any face the full picture is evaluated.
    subjects = faces[:3] if faces else [img]

    output = []
    for subject in subjects:
        output.extend(_score_image(subject))

    # Pad the unused person slots with a copy of the last scored one.
    last_slot = output[-3:]
    while len(output) < 9:
        output = output + last_slot
    return output
# Gradio
title = 'Face Recognition with Emotion and Sentiment Detector'

# Text shown above the interface; the Markdown component's `.value` is what
# gets handed to gr.Interface.
description = gr.Markdown(
"""Ever wondered what a person might be feeling looking at their picture?
Well, now you can! Try this fun app. Just upload a facial image in JPG or
PNG format. Voila! you can now see what they might have felt when the picture
was taken.
This is an updated version of Facial Expression Classifier:
https://huggingface.co/spaces/schibsted/facial_expression_classifier
""").value

# Longer background text shown below the interface.
article = gr.Markdown(
"""**DISCLAIMER:** This model does not reveal the actual emotional state of a person. Use and
interpret results at your own risk! It was built as a demo for AI course. Samples images
were downloaded from VG & AftenPosten news webpages. Copyrights belong to respective
brands. All rights reserved.
**PREMISE:** The idea is to determine an overall sentiment of a news site on a daily basis
based on the pictures. We are restricting pictures to only include close-up facial
images.
**DATA:** FER2013 dataset consists of 48x48 pixel grayscale images of faces. There are 28,709
images in the training set and 3,589 images in the test set. However, for this demo all
pictures were combined into a single dataset and 80:20 split was used for training. Images
are assigned one of the 7 emotions: Angry, Disgust, Fear, Happy, Sad, Surprise, and Neutral.
In addition to these 7 classes, images were re-classified into 3 sentiment categories based
on emotions:
Positive (Happy, Surprise)
Negative (Angry, Disgust, Fear, Sad)
Neutral (Neutral)
FER2013 (preliminary version) dataset can be downloaded at:
https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data
**EMOTION / SENTIMENT MODEL:** VGG19 was used as the base model and trained on FER2013 dataset. Model was trained
using PyTorch and FastAI. Two models were trained, one for detecting emotion and the other
for detecting sentiment. Although, this could have been done with just one model, here two
models were trained for the demo.
**FACE DETECTOR:** Darknet with YOLOv3 architecture was used for face detection. Reach out to me for full details.
In short, any image is first sent through darknet. If face is detected, then it is passed through emotion/sentiment
model for each face in the picture. If a person is detected rather than a face, the image is cropped and run through
face detector again. If a face is detected, then it is passed through emotion/sentiment model. In case face is not
detected in an image, then the entire image is evaluated to generate some score. This is done because, I couldn't
figure out how to pipe None/blank output to Gradio.Interface(). There maybe option through Gradio.Blocks() but was
too lazy to go through that at this stage. In addition, the output is restricted to only 3 faces in a picture.
""").value

enable_queue = True

examples = ['happy1.jpg', 'happy2.jpg', 'angry1.png', 'angry2.jpg', 'neutral1.jpg', 'neutral2.jpg']


def _person_outputs(number):
    """Build the image / emotion / sentiment output components for one person slot."""
    return [
        gr.Image(shape=(48, 48), label=f'Person {number}'),
        gr.Label(label=f'Emotion - Person {number}'),
        gr.Label(label=f'Sentiment - Person {number}'),
    ]


# predict() always returns nine values: three per person slot, for three slots.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Image(),
    outputs=[component for slot in (1, 2, 3) for component in _person_outputs(slot)],
    title=title,
    examples=examples,
    description=description,
    article=article,
    allow_flagging='never',
)
demo.launch(enable_queue=enable_queue)