reab5555 committed on
Commit
2c1fa32
·
verified ·
1 Parent(s): 9706eac

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import cv2
3
+ import numpy as np
4
+ import clip
5
+ import torch
6
+ from PIL import Image
7
+ import torch.nn.functional as F
8
+ from facenet_pytorch import MTCNN
9
+
10
# --- Global configuration --------------------------------------------------

# Candidate emotion prompts handed to CLIP as text labels; kept as a single
# comma-separated string because process_image() splits it on ", ".
input_labels_X = ", ".join([
    "Happy Face", "Sad Face", "Angry Face", "Fear Face", "Disgust Face",
    "Contempt Face", "Nervous Face", "Curious Face", "Flirtatious Face",
    "Ashamed Face", "Bored Face", "Confused Face", "Calm Face", "Proud Face",
    "Guilty Face", "Annoyed Face", "Desperate Face", "Jealous Face",
    "Embarrassed Face", "Impatient Face", "Uncomfortable Face", "Bitter Face",
    "Helpless Face", "Shy Face", "Infatuated Face", "Betrayed Face",
    "Shocked Face", "Relaxed Face", "Apathetic Face", "Neutral Face",
])

# Prefer the GPU when one is visible to torch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Default CLIP backbone; process_image() may swap it for another at runtime.
model, preprocess = clip.load("ViT-L/14", device=device)

# MTCNN face detector. keep_all=True returns every detected face so the
# caller can pick the largest one.
mtcnn = MTCNN(keep_all=True, device=device)
19
def process_image(input_image, selected_model):
    """Detect the largest face in *input_image* and classify its emotion with CLIP.

    Args:
        input_image: RGB image uploaded by the user (PIL.Image from Gradio).
        selected_model: CLIP model identifier, e.g. "ViT-L/14".

    Returns:
        A 2-tuple of (annotated RGB numpy image, newline-joined string of the
        top-5 emotion labels with probabilities), or the unmodified image and
        "No face detected" when MTCNN finds no usable face.
    """
    global model, preprocess

    # Reload CLIP only when the dropdown selection changed. clip.load() does
    # NOT set a `model_name` attribute (the original `model.model_name` read
    # raised AttributeError on the first call), so we tag the model ourselves.
    # The module-level model was loaded as "ViT-L/14", hence that default.
    if selected_model != getattr(model, "model_name", "ViT-L/14"):
        model, preprocess = clip.load(selected_model, device=device)
    model.model_name = selected_model

    # Keep an RGB copy for detection/return; draw on a BGR copy for OpenCV.
    rgb_frame = np.array(input_image)
    cv2_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)

    # MTCNN expects an RGB (PIL) image.
    boxes, _ = mtcnn.detect(Image.fromarray(rgb_frame))

    if boxes is None or len(boxes) == 0:
        # Return the RGB image (the original returned BGR here, which
        # displayed with swapped channels in the Gradio image output).
        return rgb_frame, "No face detected"

    # Largest face by box area. Boxes are (x1, y1, x2, y2) corner coords.
    x1, y1, x2, y2 = map(
        int, max(boxes, key=lambda box: (box[2] - box[0]) * (box[3] - box[1]))
    )

    # MTCNN can emit coordinates slightly outside the image (including
    # negatives, which would wrap as Python negative indices); clamp first.
    frame_h, frame_w = cv2_frame.shape[:2]
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = min(x2, frame_w), min(y2, frame_h)
    if x2 <= x1 or y2 <= y1:
        return rgb_frame, "No face detected"

    cv2.rectangle(cv2_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cropped_face = cv2_frame[y1:y2, x1:x2]

    # Grayscale 224x224 crop, then CLIP's own preprocessing pipeline
    # (preprocess converts the single-channel PIL image back to RGB).
    frame_gray = cv2.cvtColor(cropped_face, cv2.COLOR_BGR2GRAY)
    frame_gray_resized = cv2.resize(frame_gray, (224, 224))
    frame_tensor = preprocess(Image.fromarray(frame_gray_resized)).unsqueeze(0).to(device)

    input_labels = input_labels_X.split(", ")
    text = clip.tokenize(input_labels).to(device)

    with torch.no_grad():
        # model() returns (logits_per_image, logits_per_text); the separate
        # encode_image/encode_text calls in the original were unused work.
        logit_per_image, _ = model(frame_tensor, text)
        probabilities = F.softmax(logit_per_image[0], dim=0)

    # Top-5 labels by probability.
    combined = sorted(
        zip(input_labels, probabilities.tolist()),
        key=lambda pair: pair[1],
        reverse=True,
    )[:5]
    results = [f"{label.strip()}: {prob * 100:.1f}%" for label, prob in combined]

    # Overlay the ranked results on the annotated frame.
    for idx, line in enumerate(results):
        cv2.putText(
            cv2_frame, line, (10, 30 + idx * 30),
            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2,
        )

    # Convert back to RGB for display.
    return cv2.cvtColor(cv2_frame, cv2.COLOR_BGR2RGB), "\n".join(results)
86
+
87
# Gradio UI: image + CLIP-model dropdown in, annotated image + top-5
# emotion probabilities out.
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil"),
        gr.Dropdown(
            choices=["ViT-B/32", "ViT-B/16", "ViT-L/14"],
            label="Model",
            value="ViT-L/14",
        ),
    ],
    outputs=[
        gr.Image(type="pil", label="Processed Image"),
        gr.Textbox(label="Emotion Probabilities"),
    ],
    title="Emotion Recognition",
    description="Upload an image to detect faces and recognize emotions.",
)

# Launch only when executed as a script, so importing this module (e.g. for
# tests) does not start a server. Identical behavior when run directly.
if __name__ == "__main__":
    iface.launch()