Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import cv2
import numpy as np
import clip
import torch
from PIL import Image
import torch.nn.functional as F
from facenet_pytorch import MTCNN

# Global variables
# Comma-separated CLIP text prompts, one per candidate facial emotion; split
# on ", " at inference time to form the zero-shot label set.
input_labels_X = "Happy Face, Sad Face, Angry Face, Fear Face, Disgust Face, Contempt Face, Nervous Face, Curious Face, Flirtatious Face, Ashamed Face, Bored Face, Confused Face, Calm Face, Proud Face, Guilty Face, Annoyed Face, Desperate Face, Jealous Face, Embarrassed Face, Impatient Face, Uncomfortable Face, Bitter Face, Helpless Face, Shy Face, Infatuated Face, Betrayed Face, Shocked Face, Relaxed Face, Apathetic Face, Neutral Face"

# Prefer GPU when one is available; all models below are placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Default CLIP checkpoint loaded at startup; process_image() may replace it
# when the user picks a different model in the UI.
model, preprocess = clip.load("ViT-L/14", device=device)

# Initialize MTCNN for face detection
# keep_all=True returns every detected face; the caller picks the largest.
mtcnn = MTCNN(keep_all=True, device=device)
+
def process_image(input_image, selected_model):
    """Detect the largest face in *input_image* and classify its emotion with CLIP.

    Parameters
    ----------
    input_image : PIL.Image.Image
        RGB image from the Gradio UI.
    selected_model : str
        CLIP checkpoint name (e.g. "ViT-L/14"); the model is reloaded lazily
        when it differs from the one currently in memory.

    Returns
    -------
    tuple
        (annotated RGB image as a numpy array, newline-joined top-5 emotion
        probabilities) — or the unannotated image and "No face detected".
    """
    global model, preprocess

    # Load the selected model if it's different from the current one.
    # BUG FIX: clip.load() does not set a `model_name` attribute, so the
    # original `model.model_name` access raised AttributeError on every call.
    # We tag the model ourselves after loading; the startup model (which has
    # no tag) defaults to "ViT-L/14" to match the module-level load.
    if selected_model != getattr(model, "model_name", "ViT-L/14"):
        model, preprocess = clip.load(selected_model, device=device)
        model.model_name = selected_model

    # Work in BGR because OpenCV's drawing primitives expect it.
    cv2_frame = np.array(input_image)
    cv2_frame = cv2.cvtColor(cv2_frame, cv2.COLOR_RGB2BGR)

    # MTCNN expects an RGB PIL image.
    frame_pil = Image.fromarray(cv2.cvtColor(cv2_frame, cv2.COLOR_BGR2RGB))
    boxes, _ = mtcnn.detect(frame_pil)

    # Keep only the largest detection, by bounding-box area.
    largest_face = None
    if boxes is not None and len(boxes) > 0:
        largest_face = max(boxes, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]))

    if largest_face is None:
        # BUG FIX: the original returned the BGR frame here, so Gradio showed
        # colour-swapped output; convert back to RGB first.
        return cv2.cvtColor(cv2_frame, cv2.COLOR_BGR2RGB), "No face detected"

    # MTCNN can return coordinates slightly outside the frame; clamp so the
    # crop below is never empty or negatively indexed.
    x1, y1, x2, y2 = map(int, largest_face)
    frame_h, frame_w = cv2_frame.shape[:2]
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(frame_w, x2), min(frame_h, y2)
    if x2 <= x1 or y2 <= y1:
        return cv2.cvtColor(cv2_frame, cv2.COLOR_BGR2RGB), "No face detected"

    cv2.rectangle(cv2_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cropped_face = cv2_frame[y1:y2, x1:x2]

    # Grayscale + resize the face crop; CLIP's preprocess pipeline converts
    # the mode-"L" PIL image back to 3-channel RGB before normalising.
    frame_gray = cv2.cvtColor(cropped_face, cv2.COLOR_BGR2GRAY)
    frame_gray_resized = cv2.resize(frame_gray, (224, 224))
    frame_tensor = preprocess(Image.fromarray(frame_gray_resized)).unsqueeze(0).to(device)

    # Tokenize the candidate emotion labels for zero-shot classification.
    input_labels = input_labels_X.split(", ")
    text = clip.tokenize(input_labels).to(device)

    with torch.no_grad():
        # model() encodes both image and text internally; the original's
        # separate encode_image/encode_text calls were redundant work.
        logit_per_image, logit_per_text = model(frame_tensor, text)
        # Softmax over labels converts logits to probabilities.
        probabilities = F.softmax(logit_per_image[0], dim=0)

    # Top-5 labels by probability.
    combined_labels_probs = sorted(
        zip(input_labels, probabilities.tolist()),
        key=lambda pair: pair[1],
        reverse=True,
    )[:5]

    results = [f"{label.strip()}: {prob * 100:.1f}%" for label, prob in combined_labels_probs]

    # Overlay the ranked results on the annotated frame.
    for idx, result in enumerate(results):
        cv2.putText(cv2_frame, result, (10, 30 + idx * 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    # Convert back to RGB for display.
    output_image = cv2.cvtColor(cv2_frame, cv2.COLOR_BGR2RGB)
    return output_image, "\n".join(results)
|
| 86 |
+
|
| 87 |
+
# Build the Gradio UI: an image plus a CLIP-model picker in, an annotated
# image plus a probability report out.
image_input = gr.Image(type="pil")
model_choice = gr.Dropdown(
    choices=["ViT-B/32", "ViT-B/16", "ViT-L/14"],
    label="Model",
    value="ViT-L/14",
)
annotated_output = gr.Image(type="pil", label="Processed Image")
report_output = gr.Textbox(label="Emotion Probabilities")

iface = gr.Interface(
    fn=process_image,
    inputs=[image_input, model_choice],
    outputs=[annotated_output, report_output],
    title="Emotion Recognition",
    description="Upload an image to detect faces and recognize emotions.",
)

# Start serving the app.
iface.launch()
|