Spaces: Build error

Upload 12 files

- .gitattributes +3 -0
- Convolutional-Neural-Network.jpg +3 -0
- README.md +19 -12
- Training.py +0 -0
- app.py +74 -0
- data_viz.py +117 -0
- emo.jpg +0 -0
- introduction.py +121 -0
- models.py +420 -0
- new41.jpg +3 -0
- prediction.py +166 -0
- requirements.txt +8 -0
- vit.jpg +3 -0
.gitattributes CHANGED

```diff
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Convolutional-Neural-Network.jpg filter=lfs diff=lfs merge=lfs -text
+new41.jpg filter=lfs diff=lfs merge=lfs -text
+vit.jpg filter=lfs diff=lfs merge=lfs -text
```
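The three added lines extend the Space's existing Git LFS rules so the newly uploaded JPEGs are stored as LFS pointers rather than as regular blobs, matching the `filter=lfs diff=lfs merge=lfs -text` pattern already used for the other tracked types above.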
Convolutional-Neural-Network.jpg ADDED (stored with Git LFS)
README.md CHANGED

````diff
@@ -1,12 +1,19 @@
-
-
-
-
-
-
-
-
-
-
-
-
+# :earth_americas: GDP dashboard template
+
+A simple Streamlit app showing the GDP of different countries in the world.
+
+[](https://gdp-dashboard-template.streamlit.app/)
+
+### How to run it on your own machine
+
+1. Install the requirements
+
+   ```
+   $ pip install -r requirements.txt
+   ```
+
+2. Run the app
+
+   ```
+   $ streamlit run streamlit_app.py
+   ```
````
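(Note: this README is the stock Streamlit GDP-dashboard template rather than a description of the emotion app; the entry point uploaded in this commit is `app.py`, so `streamlit run app.py` is presumably the command that matches the rest of the upload.)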
Training.py ADDED (no content changes shown)
app.py ADDED

@@ -0,0 +1,74 @@
```python
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F

from PIL import Image
import numpy as np

from pytorch_grad_cam.utils.image import show_cam_on_image

import random
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import cv2


# import modules from this repository
import models
import prediction
import data_viz
import introduction


# -------------------------------
# Label Dictionary (1-indexed)
# -------------------------------
label_dict = {
    1: 'Surprise',
    2: 'Disgust',
    3: 'Happiness',
    4: 'Sadness',
    5: 'Anger',
    6: 'Neutral'
}


# -------------------------------
# Streamlit App UI
# -------------------------------
st.set_page_config(page_title="Emotion Classification With Computer Vision", layout="centered")

st.title("🎭 Facial Expression Recognition")

# Model selection
model_choice = st.selectbox("Choose a model", ["CNN", "VGG16", "ViT"])

# Note: prediction.py loads the selected model itself, so this eager CNN load
# is redundant and the result is unused here.
model = models.load_cnn_model()

app_mode = st.sidebar.selectbox('Contents', ['01 Introduction', '02 Data visualization', '03 Prediction'])

if app_mode == '01 Introduction':
    introduction.Show_introduction()

elif app_mode == '02 Data visualization':
    data_viz.data_visualization(model_choice)

else:
    prediction.Display_prediction(model_choice, label_dict)
```
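One detail worth calling out: the models produce a 6-way, 0-indexed output while `label_dict` is 1-indexed, so every lookup in this repo shifts the prediction by +1. A minimal sketch of that convention (the logits below are fake, for illustration only):

```python
import torch

# A real forward pass returns a [1, 6] tensor of logits for one image.
logits = torch.tensor([[0.1, 0.3, 2.2, 0.0, -0.5, 0.4]])
pred_idx = logits.argmax(dim=1).item()  # 0-based class index -> 2
emotion = label_dict[pred_idx + 1]      # 1-based lookup -> 'Happiness'
print(emotion)
```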
data_viz.py ADDED

@@ -0,0 +1,117 @@
```python
import streamlit as st
import matplotlib.pyplot as plt

from torchinfo import summary


def data_visualization(model_choice):
    def plot_model_metrics(model_type):
        # Placeholder; the real epoch range is recomputed below from the loss list length
        epochs = list(range(1, 11))

        if model_type == 'CNN':
            loss = [
                1.2389, 0.9501, 0.8562, 0.7818, 0.7231, 0.6737, 0.6299, 0.5995, 0.5672, 0.5388,
                0.4650, 0.4469, 0.4263, 0.4116, 0.3860, 0.3775, 0.3621, 0.3455, 0.3250, 0.3098,
                0.2803, 0.2633, 0.2520, 0.2465, 0.2436, 0.2339, 0.2197, 0.2168, 0.2097, 0.2021
            ]

            accuracy = [
                53.36, 65.29, 69.07, 71.31, 73.55, 75.41, 77.23, 78.31, 79.55, 80.71,
                83.26, 84.22, 85.00, 85.37, 86.30, 86.78, 87.03, 87.60, 88.68, 89.18,
                90.26, 91.04, 91.20, 91.32, 91.74, 92.02, 92.82, 92.50, 93.00, 93.25
            ]

        elif model_type == 'VGG16':
            loss = [1.2832, 0.8841, 0.7730, 0.7002, 0.6222, 0.5854, 0.5632, 0.5135, 0.4946, 0.4537]
            accuracy = [56.87, 68.18, 72.45, 75.38, 78.16, 79.32, 80.08, 82.19, 82.61, 84.26]

        elif model_type == 'ViT':
            loss = [186.7186, 176.4275, 116.8164, 159.8890, 151.8824, 151.6594, 146.9743, 143.7478, 140.8833, 138.7943]
            accuracy = [63.57, 65.16, 66.85, 68.92, 70.29, 71.09, 71.87, 72.54, 73.11, 73.92]

        else:
            st.error("Model type must be one of: CNN, VGG16, ViT")
            return

        # Set epochs to match the loss list length
        epochs = list(range(1, len(loss) + 1))

        # Plot both Loss and Accuracy
        fig, axs = plt.subplots(1, 2, figsize=(12, 4))

        axs[0].plot(epochs, loss, marker='o', color='tomato')
        axs[0].set_title(f"{model_type} - Loss")
        axs[0].set_xlabel("Epoch")
        axs[0].set_ylabel("Loss")
        axs[0].grid(True)

        axs[1].plot(epochs, accuracy, marker='o', color='seagreen')
        axs[1].set_title(f"{model_type} - Accuracy")
        axs[1].set_xlabel("Epoch")
        axs[1].set_ylabel("Accuracy (%)")
        axs[1].grid(True)

        st.pyplot(fig)

    # Streamlit UI
    st.title("Model Training Metrics Viewer")
    plot_model_metrics(model_choice)

    st.subheader("📊 Model Architecture Summary")
    st.markdown("This section provides a detailed breakdown of the model architecture, including the number of parameters, trainability, and estimated model size.")

    if model_choice == "CNN":
        st.markdown("""
        #### 🤖 CNN Architecture (`FacialReaction`)
        | Layer             | Input Shape       | Output Shape      | Params     | Trainable |
        |-------------------|-------------------|-------------------|------------|-----------|
        | Conv2d (conv1)    | [1, 3, 100, 100]  | [1, 64, 99, 99]   | 3,136      | ✅        |
        | MaxPool2d         | [1, 64, 99, 99]   | [1, 64, 49, 49]   | -          | ❌        |
        | Conv2d (conv2)    | [1, 64, 49, 49]   | [1, 64, 48, 48]   | 65,600     | ✅        |
        | MaxPool2d         | [1, 64, 48, 48]   | [1, 64, 24, 24]   | -          | ❌        |
        | Linear (fc1)      | [1, 36864]        | [1, 128]          | 4,718,720  | ✅        |
        | Linear (fc2)      | [1, 128]          | [1, 6]            | 774        | ✅        |

        **Total Parameters**: `4,788,230`
        **Trainable Parameters**: `4,788,230`
        **Non-trainable Parameters**: `0`
        **Estimated Model Size**: `~25.5 MB`
        """)

    elif model_choice == "ViT":
        st.markdown("""
        #### 🧠 Vision Transformer (ViT) Architecture

        | Component                      | Input Shape        | Output Shape       | Params     | Trainable |
        |--------------------------------|--------------------|--------------------|------------|-----------|
        | Patch Embedding (Conv2d)       | [32, 3, 224, 224]  | [32, 192, 14, 14]  | 147,648    | ✅        |
        | Transformer Blocks (12x)       | [32, 197, 192]     | [32, 197, 192]     | ~5.3M      | ✅        |
        | Classification Head (fc_out)   | [32, 192]          | [32, 6]            | 1,158      | ✅        |

        **Total Parameters**: `5,526,348`
        **Trainable Parameters**: `5,526,348`
        **Non-trainable Parameters**: `0`
        **Estimated Model Size**: `~1.3 GB`
        """)

    elif model_choice == "VGG16":
        st.markdown("""
        #### 📦 VGG16 Model Summary

        VGG16 is a deep convolutional neural network known for its uniform architecture of `3x3` conv filters and `2x2` max-pooling layers. It ends with 3 fully connected layers.

        Since it's quite large (138 million parameters), we're showing a high-level overview here:

        | Component           | Description                        |
        |---------------------|------------------------------------|
        | Convolutional Blocks| 13 Conv layers + ReLU + MaxPooling |
        | Fully Connected     | FC1 → FC2 → Output layer (6 units) |
        | Pretrained Base     | Yes (ImageNet, fine-tuned)         |

        **Total Parameters**: ~`138 million`
        **Trainable Parameters**: ~`138 million`
        **Estimated Model Size**: ~`500+ MB`
        """)
```
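`data_viz.py` imports `torchinfo.summary` but never calls it; the parameter tables above are hard-coded. A sketch of how the same numbers could be generated live (assumes `torchinfo` is installed and that the repo's `models` module, with its `load_cnn_model()` loader, is importable):

```python
from torchinfo import summary

import models  # assumption: the models.py from this commit

cnn = models.load_cnn_model()
stats = summary(cnn, input_size=(1, 3, 100, 100), verbose=0)
st.text(str(stats))  # layer-by-layer table: shapes, params, estimated size
```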
emo.jpg ADDED
introduction.py ADDED

@@ -0,0 +1,121 @@
```python
import streamlit as st
from PIL import Image


def Show_introduction():
    # Page configuration is set once in app.py

    # Load and display the image
    image = Image.open("emo.jpg")

    # Center the image using columns
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.image(image, use_container_width=True)

    # Title and subtitle
    st.title("🎭 Facial Emotion Recognition")
    st.subheader("Detecting Emotions from Facial Expressions Using Deep Learning")

    # Introduction text
    st.markdown("""
    Welcome to the **Face Emotion Recognition** app!
    This project demonstrates the use of deep learning to recognize human emotions from facial expressions in real time.

    Using a convolutional neural network (CNN) trained on facial image datasets, the model can classify emotions such as **Happy**, **Sad**, **Angry**, **Surprised**, and more.
    """)

    # Add a separator
    st.markdown("---")

    # Motivation section
    st.header("💡 Motivation")
    st.markdown("""
    Facial expressions are a fundamental mode of non-verbal communication.
    With the rise of AI and human-computer interaction, emotion recognition has gained importance in applications such as:
    - Mental health monitoring
    - Customer feedback analysis
    - Security and surveillance systems
    - Interactive gaming and virtual assistants
    """)

    # Objective section
    st.header("🎯 Objective")
    st.markdown("""
    The goal of this project is to:
    - Build a robust deep learning model that can accurately classify emotions from facial images.
    - Deploy the model in a user-friendly interface for real-time predictions.
    - Explore how AI can understand human affect through facial features.
    """)

    # How it works section
    st.header("⚙️ How It Works")
    st.markdown("""
    1. Upload an image or use your webcam to capture a face.
    2. The model detects the face and analyzes facial features.
    3. It then predicts the most likely emotion and displays the result.

    This app was built with **Streamlit** and **PyTorch**.
    """)

    # Model Overview section
    st.header("🧠 Models Used")

    # Dropdown for model selection
    model_choice = st.selectbox(
        "Select a model to learn more about it:",
        ["Convolutional Neural Network (CNN)", "Vision Transformer (ViT)", "VGG"]
    )

    if model_choice == "Convolutional Neural Network (CNN)":
        st.subheader("🌀 Convolutional Neural Network (CNN)")
        cnn_image = Image.open("Convolutional-Neural-Network.jpg")
        st.image(cnn_image, caption="Typical CNN architecture", use_container_width=True)
        st.markdown("""
        CNNs are specialized deep learning models for image processing.
        They consist of layers that automatically learn to detect features like edges, textures, and patterns in images.

        ### 📍 Where It's Used:
        - **Face recognition systems** (e.g., in mobile phones)
        - **Medical imaging** (e.g., detecting tumors)
        - **Autonomous vehicles** (e.g., recognizing road signs and pedestrians)

        In our project, CNNs serve as a baseline for detecting emotions from faces due to their efficiency and interpretability. They are especially good when dealing with relatively smaller datasets.
        """)

    elif model_choice == "Vision Transformer (ViT)":
        st.subheader("🧠 Vision Transformer (ViT)")
        vit_image = Image.open("vit.jpg")
        st.image(vit_image, caption="Vision Transformer concept", use_container_width=True)
        st.markdown("""
        ViTs bring the power of transformer models to the vision domain by splitting images into patches and processing them with self-attention, a technique originally used in NLP.

        ### 📍 Where It's Used:
        - **Large-scale image classification** (e.g., ImageNet tasks)
        - **Fine-grained object detection**
        - **Art analysis and medical diagnosis**

        In our app, ViT is used for capturing global relationships in facial features that might not be easily detected by CNNs. It's especially effective with high-resolution images and large training sets.
        """)

    elif model_choice == "VGG":
        st.subheader("🏗️ VGG Network")
        vgg_image = Image.open("new41.jpg")
        st.image(vgg_image, caption="VGG architecture overview", use_container_width=True)
        st.markdown("""
        The VGG model, introduced by the Visual Geometry Group at Oxford, is known for its deep yet simple architecture using small (3x3) convolution filters.

        ### 📍 Where It's Used:
        - **Facial recognition systems**
        - **Emotion detection**
        - **Transfer learning tasks**, where VGG is pre-trained on large datasets like ImageNet and fine-tuned for specific applications.

        We use VGG as a benchmark in our system. While it's more computationally intensive than the baseline CNN, it performs well when high accuracy is prioritized over speed.
        """)

    # Footer or next step
    st.markdown("---")
    st.info("👉 Use the sidebar to get started and test the model with your own images or webcam.")
```
models.py ADDED

@@ -0,0 +1,420 @@
```python
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms
from torchvision import datasets
from torch.utils.data import DataLoader
from PIL import Image

from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image

import numpy as np
import zipfile
import os


# -------------------------------
# Label Dictionary (1-indexed)
# -------------------------------
label_dict = {
    1: 'Surprise',
    2: 'Disgust',
    3: 'Happiness',
    4: 'Sadness',
    5: 'Anger',
    6: 'Neutral'
}


# Parameters
batch_size = 64

img_size = 100  # Updated from 48 to 100

# Transforms for CNN
transform_train = transforms.Compose([
    transforms.Resize((img_size, img_size)),                # Resize to 100x100
    transforms.RandomHorizontalFlip(),                      # Data augmentation
    transforms.RandomRotation(degrees=10),                  # Data augmentation
    transforms.ToTensor(),                                  # Convert to tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize RGB channels
])

transform_test = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


# Transforms for VGG and ViT
transform = transforms.Compose([
    transforms.Resize((224, 224)),   # Resize to 224x224
    transforms.ToTensor(),           # Convert to tensor [0,1]
    # transforms.RandomRotation(9),
    transforms.Normalize(            # Normalize using ImageNet stats
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


# Unzip the dataset archive (note: this runs at import time)
with zipfile.ZipFile('dataset_final.zip', 'r') as zip_ref:
    zip_ref.extractall('Dataset_final')


# Datasets
train_dataset_cnn = datasets.ImageFolder(root='Dataset_final/train', transform=transform_train)
test_dataset_cnn = datasets.ImageFolder(root='Dataset_final/test', transform=transform_test)

train_dataset_v = datasets.ImageFolder(root='Dataset_final/train', transform=transform)
test_dataset_v = datasets.ImageFolder(root='Dataset_final/test', transform=transform)


# DataLoaders
train_loader_cnn = DataLoader(train_dataset_cnn, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader_cnn = DataLoader(test_dataset_cnn, batch_size=batch_size, shuffle=False, num_workers=2)

train_loader_v = DataLoader(train_dataset_v, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader_v = DataLoader(test_dataset_v, batch_size=batch_size, shuffle=False, num_workers=2)


# -------------------------------
# Model: CNN (custom model)
# -------------------------------
class FacialReaction(nn.Module):
    def __init__(self, num_classes=7):
        super(FacialReaction, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=4, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=4, padding=1)
        self.fc1 = nn.Linear(64 * 24 * 24, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)


class PatchEmbed(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=192):
        super(PatchEmbed, self).__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim

        # Create a convolutional layer for patch embedding
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)       # (B, embed_dim, H/patch_size, W/patch_size)
        x = x.flatten(2)       # Flatten (B, embed_dim, N_patches)
        x = x.transpose(1, 2)  # (B, N_patches, embed_dim)
        return x


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.head_dim = embed_dim // num_heads

        assert self.head_dim * num_heads == embed_dim, "Embedding dimension must be divisible by num_heads"

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        B, N, E = x.shape

        # Linear transformation to get queries, keys and values
        qkv = self.qkv(x)
        qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0], qkv[1], qkv[2]  # each (B, num_heads, N, head_dim)

        # Attention calculation
        energy = torch.einsum("nqhd,nkhd->nhqk", [q, k])
        attention = torch.softmax(energy / (self.embed_dim ** 0.5), dim=-1)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, v]).reshape(B, N, E)
        out = self.fc_out(out)
        return out


class FeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim=768):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.gelu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_dim=768):
        super(TransformerBlock, self).__init__()
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForward(embed_dim, hidden_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        attn_out = self.attn(x)
        x = self.layernorm1(x + attn_out)  # Add & Norm
        ffn_out = self.ffn(x)
        x = self.layernorm2(x + ffn_out)   # Add & Norm
        return x


class VisionTransformer(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=192, num_heads=3, num_layers=12, num_classes=6):
        super(VisionTransformer, self).__init__()
        self.embed_dim = embed_dim

        # Patch Embedding
        self.patch_embed = PatchEmbed(img_size, patch_size, in_channels, embed_dim)

        # Positional Encoding
        self.pos_embed = nn.Parameter(torch.randn(1, (img_size // patch_size) ** 2 + 1, embed_dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))

        # Transformer Blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)
        ])

        # MLP Head for classification
        self.fc_out = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # Embed the image into patches
        x = self.patch_embed(x)

        # Add class token to the sequence
        batch_size = x.size(0)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)  # (B, N_patches+1, embed_dim)

        # Add positional encoding
        x = x + self.pos_embed

        # Pass through transformer blocks
        for block in self.blocks:
            x = block(x)

        # Classification head
        cls_output = x[:, 0]  # Extract the class token output
        out = self.fc_out(cls_output)

        return out


# -------------------------------
# Load Model Functions (correct filenames)
# -------------------------------
@st.cache_resource
def load_cnn_model():
    model = FacialReaction(num_classes=6)

    # Load full checkpoint
    checkpoint = torch.load('CNN_facial_reaction.pth', map_location='cpu')

    # Load only the model weights
    model.load_state_dict(checkpoint['model_state_dict'])

    model.eval()
    return model


@st.cache_resource
def load_vgg_model():
    model = models.vgg16(pretrained=False)
    model.classifier[6] = nn.Linear(4096, 6)
    model.load_state_dict(torch.load("vgg_dataset2_84_74.pth", map_location='cpu'))
    model.eval()
    return model


@st.cache_resource
def load_vit_model():
    model = VisionTransformer()

    model.heads = nn.Sequential(nn.Linear(192, 6))

    checkpoint = torch.load('vit_70_67.pth', map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    model.eval()
    return model


# -------------------------------
# Preprocess Webcam Image
# -------------------------------
def preprocess_image(img: Image.Image, model_type='CNN'):
    if model_type == 'CNN':
        transform = transforms.Compose([
            transforms.Resize((100, 100)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        return transform(img).unsqueeze(0)  # [1, 3, 100, 100]

    else:
        transform = transforms.Compose([
            transforms.Resize((224, 224)),   # Resize to 224x224
            transforms.ToTensor(),           # Convert to tensor [0,1]
            # transforms.RandomRotation(9),
            transforms.Normalize(            # Normalize using ImageNet stats
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])
        return transform(img).unsqueeze(0)  # [1, 3, 224, 224]


def apply_gradcam_streamlit(model, input_tensor, target_layer, class_names=None, true_label=None):
    """
    Applies Grad-CAM on a given image tensor and returns:
    - Original image
    - Original + Grad-CAM overlay
    - Predicted label (1-indexed)

    Args:
    - model: Trained CNN/VGG/ViT model.
    - input_tensor: A single image tensor (1, 3, H, W).
    - target_layer: Target layer for Grad-CAM.
    - class_names: Optional dict mapping class indices to names.
    - true_label: Optional integer ground-truth label (1-indexed).

    Returns:
    - Tuple of (original image, Grad-CAM overlay, predicted label);
      both images are NumPy arrays.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    input_tensor = input_tensor.to(device)

    # GradCAM setup
    cam = GradCAM(model=model, target_layers=[target_layer])
    targets = None  # None = explain the highest-scoring class

    # Run inference
    outputs = model(input_tensor)
    _, predicted = outputs.max(1)
    predicted_label = predicted.item() + 1  # shift from 0–5 to 1–6

    # Grad-CAM computation
    grayscale_cam = cam(input_tensor=input_tensor, targets=targets)[0, :]  # (H, W)

    # Unnormalize and prepare original image
    img_disp = input_tensor.squeeze(0).cpu()
    img_disp = img_disp * 0.5 + 0.5               # Assuming normalization was [-1, 1]
    img_disp = img_disp.permute(1, 2, 0).numpy()  # (H, W, C)

    # Create heatmap image
    heatmap_image = show_cam_on_image(img_disp, grayscale_cam, use_rgb=True)

    # Return both images plus the prediction for display
    return img_disp, heatmap_image, predicted_label
```
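As a cross-check on the CNN table in `data_viz.py`: with `kernel_size=4, padding=1`, a 100×100 input becomes 99×99 after `conv1`, 49×49 after pooling, 48×48 after `conv2`, and 24×24 after the second pool, which is exactly the `64 * 24 * 24 = 36864` features `fc1` expects. A quick sketch (not part of the commit) that verifies the shapes and the 4,788,230 parameter count:

```python
import torch

# Assumes FacialReaction is defined as above. (Importing models also runs the
# module-level dataset setup, so a standalone copy of the class is easier.)
net = FacialReaction(num_classes=6)
x = torch.randn(1, 3, 100, 100)
print(net(x).shape)                              # torch.Size([1, 6])
print(sum(p.numel() for p in net.parameters()))  # 4788230
```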
new41.jpg ADDED (stored with Git LFS)
prediction.py ADDED

@@ -0,0 +1,166 @@
```python
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F

from PIL import Image
import numpy as np

from pytorch_grad_cam.utils.image import show_cam_on_image

import random
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import cv2


import models


def Display_prediction(model_choice, label_dict):
    # Camera input
    img_file = st.camera_input("📸 Take a photo to classify")

    # Load the selected model
    if model_choice == "CNN":
        model = models.load_cnn_model()
    elif model_choice == "VGG16":
        model = models.load_vgg_model()
    else:
        model = models.load_vit_model()

    if img_file is not None:
        image = Image.open(img_file)
        st.image(image, caption="Captured Image", use_container_width=True)

        input_tensor = models.preprocess_image(image, model_type=model_choice)

        # Inference
        with torch.no_grad():
            outputs = model(input_tensor)
            _, predicted = torch.max(outputs, 1)
            predicted_label = predicted.item() + 1

        st.success(f"🧠 Predicted Emotion: **{label_dict[predicted_label]}**")

        if model_choice == "CNN":
            target_layer = model.conv2  # last conv layer of the custom CNN

            # Grad-CAM
            orig, gradcam_img, pred_label = models.apply_gradcam_streamlit(
                model=model,
                input_tensor=input_tensor,
                target_layer=target_layer,
                class_names=label_dict,
                true_label=None
            )

            st.subheader("🧠 Grad-CAM Visualization")

            st.image(gradcam_img, caption=f"Grad-CAM Heatmap: {label_dict[pred_label]}", use_container_width=True)

            # Convert both images to PIL
            orig_img_pil = Image.fromarray((orig * 255).astype(np.uint8))
            heatmap_img_pil = Image.fromarray(gradcam_img)

            # Side-by-side view
            st.image([orig_img_pil, heatmap_img_pil], caption=["Original", "Grad-CAM"], width=300)

    st.title("🧠 Random Test Image Prediction")

    if st.button("🎲 Show Random Prediction"):
        model.eval()

        test_dataset = models.test_dataset_cnn
        if model_choice != "CNN":
            test_dataset = models.test_dataset_v

        # Pick a truly random image from the whole dataset
        index_to_label = {i: int(cls) for i, cls in enumerate(test_dataset.classes)}  # classes are folder names like '1'..'6'

        total_samples = len(test_dataset)
        rand_index = random.randint(0, total_samples - 1)

        # Load image and label directly
        image, label = test_dataset[rand_index]
        input_tensor = image.unsqueeze(0)  # Add batch dimension

        # Run prediction
        model.eval()
        with torch.no_grad():
            output = model(input_tensor)
            _, predicted = torch.max(output, 1)

        # Convert class index (0-based) to folder label (1-based)
        true_label = int(test_dataset.classes[label])
        predicted_label = int(test_dataset.classes[predicted.item()])

        # Convert image for display
        image_disp = image.permute(1, 2, 0).cpu().numpy()
        image_disp = image_disp * 0.5 + 0.5  # unnormalize
        image_disp = np.clip(image_disp, 0, 1)

        # Display image using Matplotlib
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.imshow(image_disp)
        ax.set_title(f"✅ True: {label_dict[true_label]}\n🤖 Predicted: {label_dict[predicted_label]}")
        ax.axis("off")
        st.pyplot(fig)

        if model_choice == "CNN":
            # ----------------------------
            # 🧠 Apply Grad-CAM on Selected Random Image
            # ----------------------------
            target_layer = model.conv2

            # Prepare the single image tensor for Grad-CAM
            input_tensor = image.unsqueeze(0)

            # Grad-CAM
            img_disp, gradcam_overlay, _ = models.apply_gradcam_streamlit(
                model=model,
                input_tensor=input_tensor,
                target_layer=target_layer,
                class_names=label_dict,
                true_label=true_label
            )

            st.subheader("🔥 Grad-CAM on Random Test Image")

            # Convert both to displayable format
            orig_pil = Image.fromarray((img_disp * 255).astype(np.uint8))
            heatmap_pil = Image.fromarray(gradcam_overlay)

            # Side-by-side in Streamlit
            st.image([orig_pil, heatmap_pil], caption=["Original", "Grad-CAM"], width=300)
```
requirements.txt ADDED

```diff
@@ -0,0 +1,8 @@
+streamlit
+torch
+torchvision
+matplotlib
+pytorch-grad-cam
+
+numpy
+
```
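One thing to check against the "Build error" status at the top: `app.py` and `prediction.py` import `cv2`, and `data_viz.py` imports `torchinfo`, neither of which is listed here. If the Space fails at import time, adding `opencv-python-headless` and `torchinfo` to this file is the likely fix (Pillow, for `PIL`, is typically pulled in by streamlit itself).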
vit.jpg ADDED (stored with Git LFS)