| import os
|
| import numpy as np
|
| import torch
|
| import cv2
|
| import matplotlib.pyplot as plt
|
| from sklearn.svm import SVC
|
| from sklearn.metrics import classification_report, confusion_matrix
|
| from sklearn.model_selection import train_test_split
|
| from skimage.feature import hog
|
| from tensorflow.keras.models import Sequential
|
| from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
|
| from tensorflow.keras.utils import to_categorical
|
| from tensorflow.keras.preprocessing.image import load_img, img_to_array
|
| from sklearn.preprocessing import LabelEncoder
|
| from transformers import CLIPProcessor, CLIPModel
|
| from transformers import ViTForImageClassification, ViTFeatureExtractor
|
| from skimage.color import rgb2gray
|
| from skimage.feature import hog
|
| import numpy as np
|
| from PIL import Image
|
|
|
|
|
|
|
|
|
def load_custom_dataset(dataset_path, image_size=(48, 48)):
    """Load a folder-per-class image dataset into memory.

    Expects ``dataset_path`` to contain one sub-directory per class; every
    ``.jpg``/``.png`` file inside a sub-directory is loaded as a grayscale
    image resized to ``image_size``.

    Parameters
    ----------
    dataset_path : str
        Root directory of the dataset.
    image_size : tuple[int, int]
        Target (height, width) forwarded to the Keras image loader.

    Returns
    -------
    tuple
        ``(images, labels, label_map)`` — ``images`` is a float32 array
        scaled to [0, 1], ``labels`` an int array of class indices, and
        ``label_map`` maps folder name -> class index.
    """
    images = []
    labels = []
    label_map = {}
    label_idx = 0

    # Sort folder names so class-index assignment is deterministic across
    # runs and filesystems (os.listdir order is arbitrary).
    for folder_name in sorted(os.listdir(dataset_path)):
        folder_path = os.path.join(dataset_path, folder_name)

        if not os.path.isdir(folder_path):
            continue

        if folder_name not in label_map:
            label_map[folder_name] = label_idx
            label_idx += 1

        for img_name in sorted(os.listdir(folder_path)):
            img_path = os.path.join(folder_path, img_name)
            # Case-insensitive extension check so .JPG/.PNG files are not
            # silently skipped.
            if img_path.lower().endswith(('.jpg', '.png')):
                img = load_img(img_path, target_size=image_size, color_mode='grayscale')
                img_array = img_to_array(img)
                images.append(img_array)
                labels.append(label_map[folder_name])

    # Scale 8-bit pixel values into [0, 1].
    images = np.array(images, dtype="float32") / 255.0
    labels = np.array(labels)

    return images, labels, label_map
|
|
|
# Load the dataset from the local 'train' directory.
# NOTE(review): assumes a folder-per-class layout under ./train relative to
# the working directory — confirm before running.
dataset_path = 'train'
x, y, label_map = load_custom_dataset(dataset_path)

# Show which folder name was mapped to which integer class index.
print("Class labels:", label_map)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_hog_features(images, pixels_per_cell=(8, 8), cells_per_block=(2, 2)):
    """Compute a HOG descriptor for each image in a batch.

    Parameters
    ----------
    images : iterable of np.ndarray
        Images of shape (H, W) or (H, W, 1); a trailing singleton channel
        is squeezed away before HOG.
    pixels_per_cell, cells_per_block : tuple[int, int]
        Forwarded to ``skimage.feature.hog``; defaults match the previously
        hard-coded values, so existing callers are unaffected.

    Returns
    -------
    np.ndarray
        One flat HOG feature vector per input image.
    """
    # hog() expects 2-D input; drop the (H, W, 1) channel axis if present.
    grayscale_images = [img.squeeze() if len(img.shape) == 3 else img for img in images]

    return np.array([
        hog(img, pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block)
        for img in grayscale_images
    ])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Classical pipeline: HOG features + linear SVM ------------------------
print("[INFO] Extracting HOG features...")
x_hog = extract_hog_features(x)
# Fixed random_state so the 80/20 split is reproducible across runs.
x_train_hog, x_test_hog, y_train_hog, y_test_hog = train_test_split(x_hog, y, test_size=0.2, random_state=42)

print("[INFO] Training SVM classifier...")
svm = SVC(kernel='linear')
svm.fit(x_train_hog, y_train_hog)
y_pred_svm = svm.predict(x_test_hog)

# Per-class precision/recall/F1 on the held-out split.
print("\n[RESULTS] SVM Classification Report")
print(classification_report(y_test_hog, y_pred_svm))
|
|
|
|
|
|
|
|
|
|
|
# --- Deep-learning pipeline: small CNN trained from scratch ---------------
# One-hot encode labels for the softmax / categorical-crossentropy head.
y_cnn = to_categorical(y, num_classes=len(label_map))
x_train_cnn, x_test_cnn, y_train_cnn, y_test_cnn = train_test_split(x, y_cnn, test_size=0.2, random_state=42)

print("[INFO] Building CNN model...")
# Two conv/pool stages followed by a small fully-connected classifier,
# assembled incrementally with .add() calls.
cnn = Sequential()
cnn.add(Conv2D(32, (3, 3), activation='relu', input_shape=(48, 48, 1)))
cnn.add(MaxPooling2D(2, 2))
cnn.add(Conv2D(64, (3, 3), activation='relu'))
cnn.add(MaxPooling2D(2, 2))
cnn.add(Flatten())
cnn.add(Dense(128, activation='relu'))
cnn.add(Dense(len(label_map), activation='softmax'))

cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn.fit(x_train_cnn, y_train_cnn, epochs=3, batch_size=64, validation_split=0.1)

print("[INFO] Evaluating CNN...")
loss, acc = cnn.evaluate(x_test_cnn, y_test_cnn)
print(f"CNN Accuracy: {acc * 100:.2f}%")
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Pretrained CLIP (vision-language) model ------------------------------
# Weights are downloaded from the HuggingFace hub on first run.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- CLIP image/text similarity on a single sample ------------------------
# (Dropped the redundant mid-file ``from PIL import Image``; Image is
# already imported at the top of the file.)

# CLIP expects an RGB image: replicate the grayscale channel three times
# and upscale the 48x48 sample to CLIP's 224x224 input resolution.
gray_img = (x[0] * 255).astype(np.uint8).squeeze()
rgb_img = np.stack([gray_img] * 3, axis=-1)
pil_img = Image.fromarray(rgb_img).resize((224, 224))

text_input = ["Emotion description of the image"]

inputs = clip_processor(text=text_input, images=pil_img, return_tensors="pt", padding=True)

# Inference only — disable autograd bookkeeping.
with torch.no_grad():
    outputs = clip_model(**inputs)

logits_per_image = outputs.logits_per_image
logits_per_text = outputs.logits_per_text

print("\n[RESULTS] CLIP Similarity Scores")
print(f"Logits per image: {logits_per_image}")
print(f"Logits per text: {logits_per_text}")
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Pretrained ViT forward pass on a single sample -----------------------
vit_model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k")
vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Bug fix: the extractor was previously fed the raw float (48, 48, 1)
# array, which the 3-channel ViT preprocessor rejects. Convert the sample
# to an 8-bit RGB array first, mirroring the CLIP preparation.
vit_gray = (x[0] * 255).astype(np.uint8).squeeze()
vit_rgb = np.stack([vit_gray] * 3, axis=-1)

inputs_vit = vit_feature_extractor(images=vit_rgb, return_tensors="pt")
# Inference only — no gradients needed.
with torch.no_grad():
    outputs_vit = vit_model(**inputs_vit)

print("\n[RESULTS] ViT Classification Scores")
print(f"ViT logits: {outputs_vit.logits}")
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Side-by-side visualization ------------------------------------------
plt.figure(figsize=(12, 6))

plt.subplot(131)
plt.title("Sample Image")
# Bug fix: imshow rejects (H, W, 1) arrays — drop the singleton channel.
plt.imshow(x[0].squeeze(), cmap='gray')

plt.subplot(132)
plt.title("HOG Feature Visualization")
# hog() also needs a 2-D image; visualize=True returns (features, image)
# and we keep only the rendered HOG image.
hog_img = hog(x[0].squeeze(), visualize=True)[1]
plt.imshow(hog_img, cmap='gray')

plt.subplot(133)
plt.title("CNN Evaluation")
# Bug fix: cnn.predict returns a (1, num_classes) probability vector; a
# 1-D array cannot be rendered with imshow, so show the class
# probabilities as a bar chart instead.
cnn_probs = cnn.predict(np.expand_dims(x[0], axis=0))[0]
plt.bar(range(len(cnn_probs)), cnn_probs)

plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Final comparison of all four approaches ------------------------------
print("\n[COMPARISON SUMMARY]")
# Re-score the SVM on the held-out HOG split; name the value before
# formatting it.
svm_accuracy = svm.score(x_test_hog, y_test_hog) * 100
print(f"SVM (HOG) Accuracy: {svm_accuracy:.2f}%")
print(f"CNN Accuracy: {acc * 100:.2f}%")

# CLIP/ViT emit raw logits rather than an accuracy, so they are reported
# as-is.
print(f"CLIP Similarity Score: {logits_per_image}")
print(f"ViT Logits: {outputs_vit.logits}")
|
|
|