Spaces:
Build error
Build error
SakibRumu
committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,79 +1,420 @@
|
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
| 3 |
-
import
|
| 4 |
-
|
|
|
|
| 5 |
from PIL import Image
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
# Define
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
self.cnn = models.resnet50(pretrained=True)
|
| 13 |
-
self.cnn = nn.Sequential(*list(self.cnn.children())[:-2])
|
| 14 |
-
self.channel_reduction = nn.Conv2d(2048, 64, kernel_size=1)
|
| 15 |
-
self.to_rgb = nn.Conv2d(64, 3, kernel_size=1)
|
| 16 |
-
self.transformer = ViTModel.from_pretrained("google/vit-base-patch16-224")
|
| 17 |
-
self.fc = nn.Sequential(
|
| 18 |
-
nn.Linear(768, 512),
|
| 19 |
-
nn.ReLU(),
|
| 20 |
-
nn.Dropout(0.3),
|
| 21 |
-
nn.Linear(512, num_classes)
|
| 22 |
-
)
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
# Transform
|
| 38 |
transform = transforms.Compose([
|
| 39 |
transforms.Resize((224, 224)),
|
| 40 |
transforms.ToTensor(),
|
| 41 |
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
| 42 |
])
|
| 43 |
|
| 44 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
def predict_emotion(image):
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
| 71 |
|
| 72 |
-
|
|
|
|
| 73 |
fn=predict_emotion,
|
| 74 |
-
inputs=gr.Image(type="pil"),
|
| 75 |
-
outputs=[
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import torchvision.transforms as transforms
|
| 6 |
+
import numpy as np
|
| 7 |
from PIL import Image
|
| 8 |
+
import cv2
|
| 9 |
+
import dlib
|
| 10 |
+
import os
|
| 11 |
+
import requests
|
| 12 |
+
import bz2
|
| 13 |
+
import shutil
|
| 14 |
+
from efficientnet_pytorch import EfficientNet
|
| 15 |
|
| 16 |
+
# Define paths
# dlib's 68-point facial landmark model, downloaded at startup if missing.
SHAPE_PREDICTOR_URL = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
SHAPE_PREDICTOR_PATH = "shape_predictor_68_face_landmarks.dat"
# Trained QuadStreamHLAViT checkpoint expected alongside the app.
MODEL_WEIGHTS_PATH = "quad_stream_model_rafdb.pth"  # Update if weights are in a different path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
# Download and extract shape predictor if not present
def download_shape_predictor():
    """Fetch and decompress the dlib 68-point landmark model (idempotent).

    Streams SHAPE_PREDICTOR_URL to a temporary ``.bz2`` archive, checks the
    HTTP status, decompresses it to SHAPE_PREDICTOR_PATH, and removes the
    archive. Does nothing if the ``.dat`` file already exists.
    """
    if os.path.exists(SHAPE_PREDICTOR_PATH):
        print("Shape predictor already exists.")
        return

    archive_path = SHAPE_PREDICTOR_PATH + ".bz2"
    print("Downloading shape predictor...")
    # FIX: the original passed stream=True but then read response.content,
    # buffering the whole archive in memory and never checking the status.
    response = requests.get(SHAPE_PREDICTOR_URL, stream=True, timeout=60)
    response.raise_for_status()
    with open(archive_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)
    print("Extracting shape predictor...")
    with bz2.BZ2File(archive_path, "rb") as f_in:
        with open(SHAPE_PREDICTOR_PATH, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(archive_path)
    print("Shape predictor ready.")

download_shape_predictor()
|
| 38 |
+
|
| 39 |
+
# Initialize Dlib detector and predictor
detector = dlib.get_frontal_face_detector()
# 68-point landmark predictor loaded from the file fetched above.
predictor = dlib.shape_predictor(SHAPE_PREDICTOR_PATH)
|
| 42 |
+
|
| 43 |
+
# Class mapping for RAF-DB
# NOTE(review): assumed to match the label order used when training the
# checkpoint on RAF-DB — verify against the training script.
class_mapping = {
    0: "Surprise",
    1: "Fear",
    2: "Disgust",
    3: "Happiness",
    4: "Sadness",
    5: "Anger",
    6: "Neutral"
}
|
| 53 |
|
| 54 |
+
# Transform for input images
# Resize to the 224x224 input the backbone expects, then normalize with the
# standard ImageNet channel mean/std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
|
| 60 |
|
| 61 |
+
# Function to extract landmark features
def extract_landmark_features(image):
    """Return a 14-d geometric feature vector for the first detected face.

    Features: eye distance, mouth width, nose-to-mouth-corner distances,
    eye-to-nose distances, eye-vector angle at the nose tip, mouth-center to
    eye distances, mouth aspect ratio, eyebrow-to-eye distances, and two
    action-unit proxies (AU6 cheek raiser, AU12 lip corner puller).

    Parameters: image -- PIL Image (assumed RGB -- TODO confirm caller always
    converts). Returns np.float32 array of shape (14,); all zeros when no
    face is detected.
    """
    def dist(p, q):
        # Euclidean distance between two (x, y) points.
        return float(np.hypot(p[0] - q[0], p[1] - q[1]))

    image_np = np.array(image)
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    # FIX: removed unused h, w locals present in the original.

    faces = detector(gray)
    if len(faces) == 0:
        # No face: neutral all-zero feature vector.
        return np.zeros(14, dtype=np.float32)

    shape = predictor(gray, faces[0])
    landmarks = [(shape.part(i).x, shape.part(i).y) for i in range(68)]

    # Named subset of the dlib 68-point layout.
    key_points = {
        'left_eye': landmarks[36],
        'right_eye': landmarks[45],
        'nose_tip': landmarks[30],
        'mouth_left': landmarks[48],
        'mouth_right': landmarks[54],
        'left_eyebrow': landmarks[19],
        'right_eyebrow': landmarks[24],
        'jaw_left': landmarks[5],
        'jaw_right': landmarks[11],
        'chin': landmarks[8],
        'left_lower_eyelid': landmarks[41],
        'right_lower_eyelid': landmarks[46],
        'left_cheek': landmarks[2],
        'right_cheek': landmarks[14]
    }

    features = []

    eye_dist = dist(key_points['left_eye'], key_points['right_eye'])
    features.append(eye_dist)

    mouth_width = dist(key_points['mouth_left'], key_points['mouth_right'])
    features.append(mouth_width)

    nose_to_mouth_left = dist(key_points['nose_tip'], key_points['mouth_left'])
    nose_to_mouth_right = dist(key_points['nose_tip'], key_points['mouth_right'])
    features.extend([nose_to_mouth_left, nose_to_mouth_right])

    left_eye_to_nose = dist(key_points['left_eye'], key_points['nose_tip'])
    right_eye_to_nose = dist(key_points['right_eye'], key_points['nose_tip'])
    features.extend([left_eye_to_nose, right_eye_to_nose])

    # Angle at the nose tip between the two eye vectors; epsilon guards the
    # division and clip keeps arccos in its domain under float error.
    vec1 = np.array([key_points['left_eye'][0] - key_points['nose_tip'][0],
                     key_points['left_eye'][1] - key_points['nose_tip'][1]])
    vec2 = np.array([key_points['right_eye'][0] - key_points['nose_tip'][0],
                     key_points['right_eye'][1] - key_points['nose_tip'][1]])
    cos_angle = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-8)
    angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))
    features.append(angle)

    mouth_center = ((key_points['mouth_left'][0] + key_points['mouth_right'][0]) / 2,
                    (key_points['mouth_left'][1] + key_points['mouth_right'][1]) / 2)
    features.extend([dist(mouth_center, key_points['left_eye']),
                     dist(mouth_center, key_points['right_eye'])])

    # Mouth width relative to the nose-to-mouth span.
    mouth_aspect_ratio = mouth_width / (nose_to_mouth_left + nose_to_mouth_right + 1e-8)
    features.append(mouth_aspect_ratio)

    features.extend([dist(key_points['left_eyebrow'], key_points['left_eye']),
                     dist(key_points['right_eyebrow'], key_points['right_eye'])])

    # AU6 (cheek raiser) proxy: average eyelid-to-cheek distance.
    left_au6 = dist(key_points['left_lower_eyelid'], key_points['left_cheek'])
    right_au6 = dist(key_points['right_lower_eyelid'], key_points['right_cheek'])
    features.append((left_au6 + right_au6) / 2)

    # AU12 (lip corner puller) proxy: mouth-corner-to-chin distance over mouth width.
    mouth_left_to_chin = dist(key_points['mouth_left'], key_points['chin'])
    mouth_right_to_chin = dist(key_points['mouth_right'], key_points['chin'])
    features.append((mouth_left_to_chin + mouth_right_to_chin) / (2 * (mouth_width + 1e-8)))

    return np.array(features, dtype=np.float32)
|
| 153 |
+
|
| 154 |
+
# Function to get landmark mask
def get_landmark_mask(image, target_size=(7, 7)):
    """Build a soft spatial attention mask over facial landmarks.

    Draws filled circles around key eye/mouth/eyebrow/jaw/cheek landmarks on
    a full-resolution mask, then downsamples to ``target_size`` (the
    backbone's 7x7 feature grid). Returns an all-ones mask (attention left
    unmodified) when no face is detected.
    """
    image_np = np.array(image)
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    h, w = image_np.shape[:2]

    faces = detector(gray)
    if len(faces) == 0:
        # No face: neutral mask so HLA attention passes through unchanged.
        return np.ones(target_size, dtype=np.float32)

    face = faces[0]
    shape = predictor(gray, face)
    landmarks = [(shape.part(i).x, shape.part(i).y) for i in range(68)]

    mask = np.zeros((h, w), dtype=np.float32)

    # dlib 68-point indices for the regions of interest.
    eye_indices = [36, 39, 42, 45]
    mouth_indices = [48, 54, 51, 57]
    eyebrow_indices = [19, 24]
    jaw_indices = [5, 11, 8]
    cheek_indices = [2, 14]
    # Concatenation ORDER matters: the radius rule below indexes into it.
    key_points = [landmarks[i] for i in eye_indices + mouth_indices + eyebrow_indices + jaw_indices + cheek_indices]

    for i, (x, y) in enumerate(key_points):
        # Positions 4-7 are the mouth points and 12-13 the cheeks in the
        # concatenation above; they get a larger radius. Order-sensitive.
        radius = 30 if i in [4, 5, 6, 7, 12, 13] else 20
        cv2.circle(mask, (x, y), radius, 1.0, -1)

    # Downsample to the feature-map grid; interpolation can exceed 1, so clip.
    mask = cv2.resize(mask, target_size, interpolation=cv2.INTER_LINEAR)
    mask = np.clip(mask, 0, 1)
    return mask
|
| 184 |
+
|
| 185 |
+
# Model definitions
class EfficientNetBackbone(nn.Module):
    """EfficientNet-B4 feature extractor reduced to 256 channels.

    Produces the shared feature map consumed by all four streams.
    """

    def __init__(self):
        super(EfficientNetBackbone, self).__init__()
        self.efficientnet = EfficientNet.from_pretrained('efficientnet-b4')
        # NOTE(review): replacing _conv_stem AFTER from_pretrained re-creates
        # the stem with random weights, discarding its pretrained values --
        # confirm this matches how the checkpoint was trained.
        self.efficientnet._conv_stem = nn.Conv2d(3, 48, kernel_size=3, stride=2, padding=1, bias=False)
        # 1x1 conv squeezing B4's 1792 output channels down to 256.
        self.channel_reducer = nn.Conv2d(1792, 256, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn = nn.BatchNorm2d(256)
        nn.init.xavier_uniform_(self.channel_reducer.weight)

    def forward(self, x):
        # extract_features returns the final conv feature map (no head).
        x = self.efficientnet.extract_features(x)
        x = self.channel_reducer(x)
        x = self.bn(x)
        return x
|
| 200 |
+
|
| 201 |
+
class HLA(nn.Module):
    """Hybrid Landmark Attention.

    Spatial attention (optionally gated by a facial-landmark mask) followed
    by squeeze-excite style channel attention. Input and output are both
    (B, in_channels, H, W); the landmark mask, when given, is expected to
    cover a 7x7 spatial grid.
    """

    def __init__(self, in_channels=256, reduction=4):
        super(HLA, self).__init__()
        reduced_channels = in_channels // reduction
        # Two parallel 1x1 projections; their element-wise max forms the gate.
        self.spatial_branch1 = nn.Conv2d(in_channels, reduced_channels, 1)
        self.spatial_branch2 = nn.Conv2d(in_channels, reduced_channels, 1)
        self.sigmoid = nn.Sigmoid()
        self.channel_restore = nn.Conv2d(reduced_channels, in_channels, 1)
        # Squeeze-and-excite channel attention.
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, in_channels // reduction, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(in_channels // reduction, in_channels, 1, bias=False),
            nn.Sigmoid()
        )
        self.bn = nn.BatchNorm2d(in_channels, eps=1e-5)
        self.dropout = nn.Dropout2d(0.2)

    def forward(self, x, landmark_mask=None):
        b1 = self.spatial_branch1(x)
        b2 = self.spatial_branch2(x)
        spatial_attn = self.sigmoid(torch.max(b1, b2))
        spatial_attn = self.channel_restore(spatial_attn)

        if landmark_mask is not None:
            # FIX: as_tensor with explicit dtype AND device. The original
            # torch.tensor(...) call left a NumPy mask on the CPU, which
            # crashes when x lives on a GPU, and needlessly copied tensors.
            landmark_mask = torch.as_tensor(landmark_mask, dtype=x.dtype, device=x.device)
            landmark_mask = landmark_mask.view(-1, 1, 7, 7)
            spatial_attn = spatial_attn * landmark_mask

        spatial_attn = self.dropout(spatial_attn)
        spatial_out = x * spatial_attn
        channel_attn = self.channel_attention(spatial_out)
        channel_attn = self.dropout(channel_attn)
        out = spatial_out * channel_attn
        out = self.bn(out)
        return out
|
| 237 |
+
|
| 238 |
+
class ViT(nn.Module):
    """Small Vision-Transformer head over the 7x7 backbone feature map.

    Each spatial cell becomes one token (1x1 patch embedding); the CLS token
    after the encoder stack is the (B, embed_dim) output.
    """

    def __init__(self, in_channels=256, patch_size=1, embed_dim=768, num_layers=8, num_heads=12):
        super(ViT, self).__init__()
        self.patch_embed = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # Assumes a fixed 7x7 input feature map.
        num_patches = (7 // patch_size) * (7 // patch_size)
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.transformer = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, dim_feedforward=1536, activation="gelu")
            for _ in range(num_layers)
        ])
        self.ln = nn.LayerNorm(embed_dim)
        self.bn = nn.BatchNorm1d(embed_dim, eps=1e-5)
        nn.init.xavier_uniform_(self.patch_embed.weight)
        nn.init.zeros_(self.patch_embed.bias)
        nn.init.normal_(self.cls_token, std=0.02)
        nn.init.normal_(self.pos_embed, std=0.02)

    def forward(self, x):
        x = self.patch_embed(x)
        # (B, E, 7, 7) -> (B, 49, E) token sequence.
        x = x.flatten(2).transpose(1, 2)
        cls_tokens = self.cls_token.expand(x.size(0), -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)
        x = x + self.pos_embed
        # NOTE(review): TransformerEncoderLayer defaults to batch_first=False
        # (seq, batch, dim) but receives (batch, seq, dim) here. The trained
        # checkpoint presumably matches this layout -- do NOT "fix" without
        # retraining; confirm against the training code.
        for layer in self.transformer:
            x = layer(x)
        # Keep only the CLS token as the sequence summary.
        x = x[:, 0]
        x = self.ln(x)
        x = self.bn(x)
        return x
|
| 268 |
+
|
| 269 |
+
class IntensityStream(nn.Module):
    """Texture/gradient stream: fixed Sobel filters, conv, and self-attention.

    forward returns (features (B, 256), gradient-magnitude map, per-position
    channel variance); only the first element is used for fusion downstream.
    """

    def __init__(self, in_channels=256):
        super(IntensityStream, self).__init__()
        # Depthwise Sobel kernels, initialized to the classic 3x3 operators.
        sobel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=torch.float32)
        sobel_y = torch.tensor([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=torch.float32)
        self.sobel_x = nn.Conv2d(in_channels, in_channels, 3, padding=1, bias=False, groups=in_channels)
        self.sobel_y = nn.Conv2d(in_channels, in_channels, 3, padding=1, bias=False, groups=in_channels)
        # repeat promotes (3, 3) -> (in_channels, 1, 3, 3), the depthwise weight shape.
        self.sobel_x.weight.data = sobel_x.repeat(in_channels, 1, 1, 1)
        self.sobel_y.weight.data = sobel_y.repeat(in_channels, 1, 1, 1)
        self.conv = nn.Conv2d(in_channels, 128, 3, padding=1)
        self.bn = nn.BatchNorm2d(128, eps=1e-5)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.attention = nn.MultiheadAttention(embed_dim=128, num_heads=1)
        nn.init.xavier_uniform_(self.conv.weight)
        nn.init.zeros_(self.conv.bias)

    def forward(self, x):
        gx = self.sobel_x(x)
        gy = self.sobel_y(x)
        # Edge-strength map; +1e-8 keeps sqrt differentiable at zero.
        grad_magnitude = torch.sqrt(gx**2 + gy**2 + 1e-8)
        # Variance across channels at each position, flattened to (B, H*W).
        variance = ((x - x.mean(dim=1, keepdim=True))**2).mean(dim=1).flatten(1)
        cnn_out = F.relu(self.conv(grad_magnitude))
        cnn_out = self.bn(cnn_out)
        # Global-average-pooled texture descriptor (B, 128).
        texture_out = self.pool(cnn_out).squeeze(-1).squeeze(-1)
        # MultiheadAttention expects (seq, batch, embed); tokens are spatial cells.
        attn_in = cnn_out.flatten(2).permute(2, 0, 1)
        # L2-normalize each token before attention (stabilizes the dot products).
        attn_in = attn_in / (attn_in.norm(dim=-1, keepdim=True) + 1e-8)
        attn_out, _ = self.attention(attn_in, attn_in, attn_in)
        # Mean over the spatial-token axis -> (B, 128) context descriptor.
        context_out = attn_out.mean(dim=0)
        out = torch.cat([texture_out, context_out], dim=1)
        return out, grad_magnitude, variance
|
| 299 |
+
|
| 300 |
+
class LandmarkStream(nn.Module):
    """Embed a 14-d landmark-geometry vector into the 768-d fusion space.

    Three Linear layers, each followed by BatchNorm; ReLU activation and
    dropout after the first two. Output shape: (B, embed_dim).
    """

    def __init__(self, input_dim=14, embed_dim=768):
        super(LandmarkStream, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, embed_dim)
        self.bn1 = nn.BatchNorm1d(128)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(embed_dim)
        self.dropout = nn.Dropout(0.4)
        # Xavier weights / zero biases, matching the other streams' init.
        for linear in (self.fc1, self.fc2, self.fc3):
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        hidden = self.dropout(F.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(F.relu(self.bn2(self.fc2(hidden))))
        return self.bn3(self.fc3(hidden))
|
| 324 |
+
|
| 325 |
+
class QuadStreamHLAViT(nn.Module):
    """Four-stream emotion classifier over a shared EfficientNet feature map.

    Streams: HLA attention, ViT, intensity (texture/gradient), and landmark
    geometry. Each is projected to 768-d, concatenated, and classified.
    forward returns (logits, hla_out, vit_out, grad_magnitude, variance);
    inference only consumes the logits.
    """

    def __init__(self, num_classes=7):
        super(QuadStreamHLAViT, self).__init__()
        self.backbone = EfficientNetBackbone()
        self.hla = HLA()
        self.vit = ViT()
        self.intensity = IntensityStream()
        self.landmark = LandmarkStream(input_dim=14, embed_dim=768)
        # Per-stream projections into the shared 768-d fusion space
        # (ViT and landmark streams already emit 768-d).
        self.fc_hla = nn.Linear(256*7*7, 768)
        self.fc_intensity = nn.Linear(256, 768)
        self.fusion_fc = nn.Linear(768*4, 512)
        self.bn_fusion = nn.BatchNorm1d(512, eps=1e-5)
        self.dropout = nn.Dropout(0.6)
        self.classifier = nn.Linear(512, num_classes)
        nn.init.xavier_uniform_(self.fc_hla.weight)
        nn.init.zeros_(self.fc_hla.bias)
        nn.init.xavier_uniform_(self.fc_intensity.weight)
        nn.init.zeros_(self.fc_intensity.bias)
        nn.init.xavier_uniform_(self.fusion_fc.weight)
        nn.init.zeros_(self.fusion_fc.bias)
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, x, landmark_features, landmark_mask=None):
        features = self.backbone(x)
        hla_out = self.hla(features, landmark_mask)
        vit_out = self.vit(features)
        intensity_out, grad_magnitude, variance = self.intensity(features)
        landmark_out = self.landmark(landmark_features)
        # Flatten the (B, 256, 7, 7) HLA map before projecting to 768-d.
        hla_flat = self.fc_hla(hla_out.view(-1, 256*7*7))
        intensity_flat = self.fc_intensity(intensity_out)
        fused = torch.cat([hla_flat, vit_out, intensity_flat, landmark_out], dim=1)
        fused = F.relu(self.fusion_fc(fused))
        fused = self.bn_fusion(fused)
        fused = self.dropout(fused)
        logits = self.classifier(fused)
        return logits, hla_out, vit_out, grad_magnitude, variance
|
| 362 |
+
|
| 363 |
+
# Load model
model = QuadStreamHLAViT(num_classes=7)
if os.path.exists(MODEL_WEIGHTS_PATH):
    try:
        # weights_only=True avoids unpickling arbitrary objects from the checkpoint.
        model.load_state_dict(torch.load(MODEL_WEIGHTS_PATH, map_location=torch.device('cpu'), weights_only=True))
        print("Model weights loaded successfully.")
    except Exception as e:
        # Best-effort: the app still starts (with random weights) so the
        # Space shows a useful message instead of crashing at import time.
        print(f"Error loading model weights: {e}")
else:
    print(f"Model weights not found at {MODEL_WEIGHTS_PATH}. Please upload the weights.")
model.eval()
|
| 374 |
+
|
| 375 |
+
# Inference function
def predict_emotion(image):
    """Predict the facial emotion shown in *image*.

    Returns a (label, probabilities) pair where probabilities maps each
    emotion name to a 4-decimal string; on any failure returns
    ("Error", {"Message": ...}) so the Gradio UI shows the problem.
    """
    try:
        # Normalize the input to an RGB PIL image (Gradio may hand us an array).
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        rgb_image = image.convert("RGB")

        # Landmark-derived inputs for the model's auxiliary streams.
        lm_features = extract_landmark_features(rgb_image)
        lm_mask = get_landmark_mask(rgb_image)

        # Preprocess image and pack tensors with a batch dimension.
        img_tensor = transform(rgb_image).unsqueeze(0)
        lm_features_tensor = torch.tensor(lm_features, dtype=torch.float32).unsqueeze(0)

        # Forward pass; only the logits are needed at inference time.
        with torch.no_grad():
            outputs, _, _, _, _ = model(img_tensor, lm_features_tensor, lm_mask)
        probs = F.softmax(outputs, dim=1)[0]
        pred_label = torch.argmax(probs).item()

        # Per-class probabilities formatted for the JSON output component.
        prob_dict = {
            class_mapping[idx]: f"{probs[idx].item():.4f}"
            for idx in range(len(class_mapping))
        }
        return class_mapping[pred_label], prob_dict
    except Exception as e:
        return "Error", {"Message": f"Failed to process image: {str(e)}"}
|
| 404 |
|
| 405 |
+
# Gradio interface
iface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Textbox(label="Predicted Emotion"),
        gr.JSON(label="Emotion Probabilities")
    ],
    title="Facial Emotion Recognition with QuadStreamHLAViT",
    description="Upload an image to predict facial emotions (Surprise, Fear, Disgust, Happiness, Sadness, Anger, Neutral) using a QuadStreamHLAViT model trained on RAF-DB. Model accuracy: 82.31%.",
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x (flagging_mode
    # replaces it) -- confirm which gradio version the Space pins.
    allow_flagging="never"
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
|