# face-emotion-detection / comparison.py
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from skimage.feature import hog
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import CLIPProcessor, CLIPModel
from transformers import ViTForImageClassification, ViTFeatureExtractor
# ----------------------
# 1. Load and Preprocess Custom Dataset
# ----------------------
def load_custom_dataset(dataset_path, image_size=(48, 48)):
    images = []
    labels = []
    label_map = {}
    label_idx = 0

    # Loop through the dataset folders (each folder is an emotion class)
    for folder_name in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, folder_name)

        # Ignore non-directories (i.e., files)
        if not os.path.isdir(folder_path):
            continue

        # Assign an integer label to each emotion
        if folder_name not in label_map:
            label_map[folder_name] = label_idx
            label_idx += 1

        # Load images and labels
        for img_name in os.listdir(folder_path):
            img_path = os.path.join(folder_path, img_name)
            if img_name.lower().endswith(('.jpg', '.png')):
                img = load_img(img_path, target_size=image_size, color_mode='grayscale')
                img_array = img_to_array(img)
                images.append(img_array)
                labels.append(label_map[folder_name])

    # Convert images and labels to numpy arrays
    images = np.array(images, dtype="float32") / 255.0  # Normalize to [0, 1]
    labels = np.array(labels)
    return images, labels, label_map
dataset_path = 'train' # Replace with the path to your dataset
x, y, label_map = load_custom_dataset(dataset_path)
# Print class mapping for reference
print("Class labels:", label_map)
# ----------------------
# 2. HOG + SVM (Classical ML)
# ----------------------
def extract_hog_features(images):
    # Images are (H, W, 1) grayscale arrays; squeeze the channel axis to (H, W)
    # because skimage's hog() expects a 2-D image by default
    grayscale_images = [img.squeeze() if img.ndim == 3 else img for img in images]
    # Extract HOG features from each grayscale image
    return np.array([hog(img, pixels_per_cell=(8, 8), cells_per_block=(2, 2)) for img in grayscale_images])
# (If your inputs were RGB instead, convert them with skimage.color.rgb2gray before calling hog.)
print("[INFO] Extracting HOG features...")
x_hog = extract_hog_features(x)
x_train_hog, x_test_hog, y_train_hog, y_test_hog = train_test_split(x_hog, y, test_size=0.2, random_state=42)
print("[INFO] Training SVM classifier...")
svm = SVC(kernel='linear')
svm.fit(x_train_hog, y_train_hog)
y_pred_svm = svm.predict(x_test_hog)
print("\n[RESULTS] SVM Classification Report")
print(classification_report(y_test_hog, y_pred_svm))
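
# Confusion matrix for the SVM (a minimal sketch using the already-imported
# confusion_matrix; rows are true classes, columns are predicted classes)
print("SVM confusion matrix:")
print(confusion_matrix(y_test_hog, y_pred_svm))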
# ----------------------
# 3. CNN (Deep Learning)
# ----------------------
y_cnn = to_categorical(y, num_classes=len(label_map))
x_train_cnn, x_test_cnn, y_train_cnn, y_test_cnn = train_test_split(x, y_cnn, test_size=0.2, random_state=42)
print("[INFO] Building CNN model...")
cnn = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(48, 48, 1)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(len(label_map), activation='softmax')
])
cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn.fit(x_train_cnn, y_train_cnn, epochs=3, batch_size=64, validation_split=0.1)
print("[INFO] Evaluating CNN...")
loss, acc = cnn.evaluate(x_test_cnn, y_test_cnn)
print(f"CNN Accuracy: {acc * 100:.2f}%")
# ----------------------
# 4. CLIP (Vision-Language) Comparison
# ----------------------
# Load the CLIP model and processor from Huggingface
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# CLIP expects RGB PIL images, so convert the 48x48 grayscale array to a
# 3-channel image and upscale it to CLIP's 224x224 input resolution
gray_img = (x[0] * 255).astype(np.uint8).squeeze() # shape: (48, 48)
rgb_img = np.stack([gray_img]*3, axis=-1) # Convert to RGB shape: (48, 48, 3)
pil_img = Image.fromarray(rgb_img).resize((224, 224)) # Resize for CLIP input
# Prepare text input
text_input = ["Emotion description of the image"]
# Process using CLIPProcessor
inputs = clip_processor(text=text_input, images=pil_img, return_tensors="pt", padding=True)
# Get model outputs (no gradients needed for inference)
with torch.no_grad():
    outputs = clip_model(**inputs)
# CLIP similarity scores between the image and each text prompt
logits_per_image = outputs.logits_per_image  # shape: (num_images, num_texts)
logits_per_text = outputs.logits_per_text    # transpose: (num_texts, num_images)
print("\n[RESULTS] CLIP Similarity Scores")
print(f"Logits per image: {logits_per_image}")
print(f"Logits per text: {logits_per_text}")
# ----------------------
# 5. Vision Transformer (ViT) Comparison
# ----------------------
# Load ViT model and feature extractor for image classification. Note: the
# in21k checkpoint ships without a fine-tuned classification head, so the
# logits below come from a randomly initialized head sized for our classes.
vit_model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k", num_labels=len(label_map)
)
vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
# Process the image and get predictions. The feature extractor expects an RGB
# image, so reuse the PIL conversion from the CLIP section above.
inputs_vit = vit_feature_extractor(pil_img, return_tensors="pt")
with torch.no_grad():
    outputs_vit = vit_model(**inputs_vit)
print("\n[RESULTS] ViT Classification Scores")
print(f"ViT logits: {outputs_vit.logits}")
# ----------------------
# 6. Comparison Results Visualization
# ----------------------
# Prepare to visualize the comparisons
plt.figure(figsize=(12, 6))
# Plot Sample Image
plt.subplot(131)
plt.title("Sample Image")
plt.imshow(x[0].squeeze(), cmap='gray')  # squeeze (48, 48, 1) -> (48, 48) for imshow
# Plot HOG Feature Visualization (hog needs a 2-D image, hence the squeeze)
plt.subplot(132)
plt.title("HOG Feature Visualization")
hog_img = hog(x[0].squeeze(), pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=True)[1]
plt.imshow(hog_img, cmap='gray')
# Plot the CNN's class probabilities for the sample image (a 1-D probability
# vector cannot be shown with imshow, so use a bar chart instead)
plt.subplot(133)
plt.title("CNN Class Probabilities")
cnn_probs = cnn.predict(np.expand_dims(x[0], axis=0))[0]
plt.bar(range(len(cnn_probs)), cnn_probs)
plt.xticks(range(len(cnn_probs)), sorted(label_map, key=label_map.get), rotation=45)
plt.tight_layout()
plt.show()
# ----------------------
# 7. Final Model Comparison Summary
# ----------------------
# You can compare the results side by side in a table or any custom visualization
print("\n[COMPARISON SUMMARY]")
print(f"SVM (HOG) Accuracy: {svm.score(x_test_hog, y_test_hog) * 100:.2f}%")
print(f"CNN Accuracy: {acc * 100:.2f}%")
# CLIP and ViT don't have a direct "accuracy" metric, but you can report similarity scores for CLIP or logits for ViT.
print(f"CLIP Similarity Score: {logits_per_image}")
print(f"ViT Logits: {outputs_vit.logits}")