app.py ADDED
@@ -0,0 +1,149 @@
import gradio as gr
from maevit import ViTForEmotionClassificationMLP, MAEViT
import torch
from torchvision import transforms
from PIL import Image
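
# NOTE: `maevit` is this project's local module. The interface sketched below is
# inferred from how the models are used in this file, not verified independently:
#   mae_model.forward_encoder(img)            -> (tokens, mask, ids_restore)
#   mae_model.forward_decoder(x, ids_restore) -> patch tokens with a leading CLS token
#   mae_model.patchify(img) / unpatchify(p)   map [B, 3, H, W] <-> [B, N, patch_dim]
#   ft_model(img)                             -> logits over the emotion classes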

IMAGE_SIZE = 224

# Checkpoints: MAE pre-training weights and the fine-tuned emotion classifier
pt_model_path = 'MAE1.bin'
ft_model_path = 'EmotionClassifier1.bin'

transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    # Standard ImageNet normalisation statistics
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
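# For reference, this maps any PIL image to a normalised [3, 224, 224] tensor, e.g.
#   transform(Image.new('RGB', (640, 480))).shape == torch.Size([3, 224, 224])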

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Lightweight MAE: 2-layer, 128-d ViT encoder with a 2-layer, 64-d decoder;
# 75% of patches are masked during pre-training.
mae_model = MAEViT(
    image_size=224,
    patch_size=16,
    embed_dim=128,
    encoder_layers=2,
    encoder_heads=4,
    mlp_ratio=2.0,
    mask_ratio=0.75,
    decoder_embed_dim=64,
    decoder_layers=2,
    decoder_heads=4,
    dropout=0.1
)
mae_model.load_state_dict(torch.load(pt_model_path, map_location='cpu'))
mae_model.eval()
mae_model.to(device)

# The classifier reuses the MAE encoder configuration; these hyperparameters
# must match the fine-tuned checkpoint or load_state_dict() will fail.
ft_model = ViTForEmotionClassificationMLP(
    image_size=224,
    patch_size=16,
    embed_dim=128,
    encoder_layers=2,
    encoder_heads=4,
    mlp_ratio=2.0,
    dropout=0.1,
    num_classes=9,
)

ft_model.load_state_dict(torch.load(ft_model_path, map_location='cpu'))
ft_model.eval()
ft_model.to(device)


# Class index -> emotion label (label set and order follow the training data)
yolo_mapping = {
    0: "Angry",
    1: "Contempt",
    2: "Disgust",
    3: "Fear",
    4: "Happy",
    5: "Natural",
    6: "Sad",
    7: "Sleepy",
    8: "Surprised"
}


def mae_reconstruct(image: Image.Image):
    """Run one MAE pass and return (masked input, reconstruction) as numpy arrays."""
    img = transform(image).unsqueeze(0)
    img = img.to(device)

    with torch.no_grad():
        x_enc, mask, ids_restore = mae_model.forward_encoder(img)
        x_rec_patches = mae_model.forward_decoder(x_enc, ids_restore)

    img_rec = mae_model.unpatchify(x_rec_patches[:, 1:, :])  # drop CLS token -> [1, 3, 224, 224]
    img_patches = mae_model.patchify(img)                    # [1, num_patches, patch_dim]

    # Zero out the patches the encoder never saw (mask == 1 marks hidden patches)
    masked_patches = img_patches.clone()
    mask = mask.unsqueeze(-1).to(torch.bool)  # [1, num_patches, 1]
    masked_patches = masked_patches.masked_fill(mask, 0)

    img_masked = mae_model.unpatchify(masked_patches)  # [1, 3, 224, 224]

    # Invert the ImageNet normalisation for display
    inv_normalize = transforms.Normalize(
        mean=[-m / s for m, s in zip((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))],
        std=[1 / s for s in (0.229, 0.224, 0.225)]
    )

    def to_img(tensor):
        img = tensor.squeeze(0).cpu()
        img = inv_normalize(img)
        return img.permute(1, 2, 0).clamp(0, 1).numpy()  # HWC in [0, 1]

    masked_np = to_img(img_masked)
    recon_np = to_img(img_rec)

    return masked_np, recon_np
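
# Quick local sanity check (a sketch; 'sample.jpg' is a hypothetical test image):
#   masked, recon = mae_reconstruct(Image.open('sample.jpg').convert('RGB'))
#   print(masked.shape, recon.shape)  # both (224, 224, 3)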


def classify(image: Image.Image):
    """Predict the emotion label for a PIL image."""
    img = transform(image).unsqueeze(0)
    img = img.to(device)

    with torch.no_grad():
        logits = ft_model(img)
    probs = logits.softmax(dim=-1)
    predicted_class = probs.argmax(dim=-1).item()
    predicted_label = yolo_mapping[int(predicted_class)]

    return predicted_label


def predict(image: Image.Image):
    """Take a PIL image and return the masked input, the MAE reconstruction,
    and the predicted emotion label."""
    masked_image, recon_image = mae_reconstruct(image)
    predicted_label = classify(image)

    return masked_image, recon_image, predicted_label


gr.Interface(
    fn=predict,
    inputs=gr.Image(type='pil', label='Input Image'),
    outputs=[
        gr.Image(type='numpy', label='Randomly Masked Image'),
        gr.Image(type='numpy', label='Reconstructed Image'),
        gr.Textbox(label='Predicted Emotion')
    ],
    title="Emotion Recognition and MAE Reconstruction",
    description="Upload an image to see the randomly masked input, the MAE reconstruction, and the predicted emotion label."
).launch(share=True, debug=True)  # share=True has no effect on Spaces; useful for local runs
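
# To run locally (assumes the two .bin checkpoints and the maevit module sit next
# to app.py, with gradio, torch, torchvision, and pillow installed):
#   python app.py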