# Source snapshot: commit eacd6a2 — "Backend + Frontend done" (author: ALYYAN)
# research/evaluation.py
import torch
import pandas as pd
import json
from pathlib import Path
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from tqdm import tqdm
# Add the project's src directory to the Python path
# This allows us to import our custom modules
src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src'))
sys.path.append(src_path)
# Now we can import our custom classes
from cnnClassifier.components.multi_task_model_trainer import MultiTaskEfficientNet, FairFaceDataset
from cnnClassifier.utils.common import read_yaml
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from transformers import AutoImageProcessor
# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Define paths directly. We are not using the config manager.
# NOTE: paths are relative to the repo root — run this script from there.
MODEL_PATH = Path("artifacts/multi_task_model_trainer/facial_demographics_model")
# Cleaned FairFace metadata CSV produced by the data-preparation stage.
DATA_PATH = Path("artifacts/data_preparation/fairface_cleaned.csv")
PARAMS_PATH = Path("params.yaml")
# All evaluation artifacts (metrics.json, confusion matrices, label_maps.json) land here.
EVALUATION_OUTPUT_DIR = Path("artifacts/manual_evaluation")
EVALUATION_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Load parameters
# read_yaml appears to return an attribute-accessible mapping, hence the dotted access.
params = read_yaml(PARAMS_PATH)
IMAGE_SIZE = params.IMAGE_SIZE  # square input resolution fed to the model
BATCH_SIZE = params.BATCH_SIZE
TEST_SPLIT_SIZE = params.TEST_SPLIT_SIZE  # must match the value used in training (see split below)
RANDOM_STATE = params.RANDOM_STATE  # must match training to reproduce the identical test split
# ==============================================================================
# MAIN EVALUATION LOGIC
# ==============================================================================
def evaluate_model():
    """Evaluate the trained multi-task model on the held-out test split.

    Loads the cleaned FairFace CSV, reproduces the exact train/test split used
    during training (same TEST_SPLIT_SIZE / RANDOM_STATE / stratification on
    'age'), runs inference over the test set, and writes per-task accuracy
    metrics, confusion-matrix heatmaps, and the label maps to
    EVALUATION_OUTPUT_DIR.

    Side effects: reads DATA_PATH, config/config.yaml, and the checkpoint under
    MODEL_PATH; writes PNG/JSON artifacts under EVALUATION_OUTPUT_DIR; prints
    progress and reports to stdout.
    """
    tasks = ('age', 'gender', 'race')
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Running evaluation on device: {device} ---")

    # 1. Load data and prepare the test split
    print("Loading and preparing test data...")
    df = pd.read_csv(DATA_PATH)
    label_maps = _build_label_maps(df, tasks)

    # Use the same random_state/stratification to ensure we get the identical
    # test split as in training.
    _, test_df = train_test_split(
        df,
        test_size=TEST_SPLIT_SIZE,
        random_state=RANDOM_STATE,
        stratify=df['age'],
    )

    # 2. Create the PyTorch DataLoader
    model_config = read_yaml(Path("config/config.yaml"))
    base_model_name = model_config.multi_task_model_trainer.model_name
    # The processor supplies the normalization stats the base model was trained with.
    processor = AutoImageProcessor.from_pretrained(base_model_name)
    _transforms = Compose([
        Resize((IMAGE_SIZE, IMAGE_SIZE)),
        ToTensor(),
        Normalize(mean=processor.image_mean, std=processor.image_std),
    ])
    test_dataset = FairFaceDataset(dataframe=test_df, processor=processor, transforms=_transforms)
    # shuffle=False keeps predictions aligned with test_df row order.
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # 3. Load the trained model
    print(f"Loading model from: {MODEL_PATH}")
    model = MultiTaskEfficientNet(
        model_name=str(MODEL_PATH),  # Pass the path as the model name
        num_labels_age=len(label_maps['age_id2label']),
        num_labels_gender=len(label_maps['gender_id2label']),
        num_labels_race=len(label_maps['race_id2label']),
    ).to(device)
    # Load the trained weights.
    # NOTE(review): consider torch.load(..., weights_only=True) (torch>=1.13)
    # to avoid unpickling arbitrary objects from the checkpoint — confirm the
    # project's minimum torch version first.
    model.load_state_dict(torch.load(MODEL_PATH / 'pytorch_model.bin', map_location=device))
    model.eval()

    # 4. Run predictions on the test set
    print("Running predictions on the test set...")
    all_preds = {task: [] for task in tasks}
    all_labels = {task: [] for task in tasks}
    # no_grad hoisted around the whole loop: nothing here needs gradients.
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels']
            outputs = model(pixel_values=pixel_values)
            for task in tasks:
                all_preds[task].extend(outputs[f'{task}_logits'].argmax(1).cpu().numpy())
                all_labels[task].extend(labels[task].cpu().numpy())

    # 5. Calculate metrics, generate reports, and save artifacts
    print("--- Evaluation Results ---")
    metrics = {}
    for task in tasks:
        class_names = list(label_maps[f'{task}_id2label'].values())
        accuracy = accuracy_score(all_labels[task], all_preds[task])
        print(f"\n--- {task.capitalize()} ---")
        print(f"Accuracy: {accuracy:.4f}")
        report_str = classification_report(
            all_labels[task],
            all_preds[task],
            target_names=class_names,
        )
        print("Classification Report:")
        print(report_str)
        metrics[f'{task}_accuracy'] = accuracy
        _save_confusion_matrix(all_labels[task], all_preds[task], class_names, task)

    # Save metrics to a JSON file
    metrics_path = EVALUATION_OUTPUT_DIR / "metrics.json"
    with open(metrics_path, 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"\nSaved final metrics to {metrics_path}")

    # Save the label maps used for this evaluation run.
    # NOTE: json stringifies the int keys of the *_id2label dicts; consumers
    # must cast them back to int.
    label_maps_path = EVALUATION_OUTPUT_DIR / "label_maps.json"
    with open(label_maps_path, 'w') as f:
        json.dump(label_maps, f, indent=4)
    print(f"Saved label maps to {label_maps_path}")


def _build_label_maps(df, tasks):
    """Build label2id/id2label maps per task and add integer `<task>_id` columns to df.

    Mutates df in place (adds the id columns) and returns the maps dict keyed
    as '<task>_label2id' / '<task>_id2label'. Labels are sorted so the id
    assignment is deterministic and matches the training-time mapping.
    """
    label_maps = {}
    for task in tasks:
        labels = sorted(df[task].unique())
        label_maps[f'{task}_label2id'] = {label: i for i, label in enumerate(labels)}
        label_maps[f'{task}_id2label'] = {i: label for i, label in enumerate(labels)}
        df[f'{task}_id'] = df[task].map(label_maps[f'{task}_label2id'])
    return label_maps


def _save_confusion_matrix(y_true, y_pred, class_names, task):
    """Render one task's confusion-matrix heatmap and save it under EVALUATION_OUTPUT_DIR."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, cmap='Blues')
    plt.title(f'Confusion Matrix for {task.capitalize()}', fontsize=16)
    plt.ylabel('Actual', fontsize=12)
    plt.xlabel('Predicted', fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    cm_path = EVALUATION_OUTPUT_DIR / f'{task}_confusion_matrix.png'
    plt.savefig(cm_path, bbox_inches='tight')
    plt.close()  # Close the plot to avoid displaying it in the console
    print(f"Saved {task} confusion matrix to {cm_path}")
# Script entry point: run the full evaluation when executed directly.
if __name__ == '__main__':
    evaluate_model()