# File size: 6,676 Bytes
# eacd6a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# research/evaluation.py

import torch
import pandas as pd
import json
from pathlib import Path
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from tqdm import tqdm

# Make the sibling ``src`` directory (one level above this file's folder)
# importable, so the project's own packages can be imported further down.
src_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, 'src')
)
sys.path.append(src_path)

# Now we can import our custom classes
from cnnClassifier.components.multi_task_model_trainer import MultiTaskEfficientNet, FairFaceDataset
from cnnClassifier.utils.common import read_yaml
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from transformers import AutoImageProcessor

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# All locations are hard-coded here; the project's config manager is
# deliberately bypassed for this stand-alone script.
PARAMS_PATH = Path("params.yaml")
DATA_PATH = Path("artifacts/data_preparation/fairface_cleaned.csv")
MODEL_PATH = Path("artifacts/multi_task_model_trainer/facial_demographics_model")

# Output folder for the reports; created at import time if it is missing.
EVALUATION_OUTPUT_DIR = Path("artifacts/manual_evaluation")
EVALUATION_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Hyper-parameters read from params.yaml
params = read_yaml(PARAMS_PATH)
IMAGE_SIZE, BATCH_SIZE = params.IMAGE_SIZE, params.BATCH_SIZE
TEST_SPLIT_SIZE, RANDOM_STATE = params.TEST_SPLIT_SIZE, params.RANDOM_STATE

# ==============================================================================
# MAIN EVALUATION LOGIC
# ==============================================================================
def evaluate_model():
    """Evaluate the trained multi-task model on the held-out test split.

    The run proceeds in five steps:

    1. Read ``DATA_PATH``, derive label<->id maps for the ``age``,
       ``gender`` and ``race`` columns, and carve out the test split with
       ``TEST_SPLIT_SIZE`` / ``RANDOM_STATE`` (stratified on ``age``).
    2. Wrap the test rows in a ``FairFaceDataset`` and a ``DataLoader``.
    3. Build ``MultiTaskEfficientNet`` and load ``pytorch_model.bin`` from
       ``MODEL_PATH``.
    4. Collect arg-max predictions and ground-truth ids for every task.
    5. Print accuracy and a classification report per task, then write the
       confusion-matrix PNGs, ``metrics.json`` and ``label_maps.json`` into
       ``EVALUATION_OUTPUT_DIR``.

    Returns:
        None. All results are printed or written to disk.
    """
    tasks = ('age', 'gender', 'race')
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Running evaluation on device: {device} ---")

    # 1. Load data and prepare the test split
    print("Loading and preparing test data...")
    df = pd.read_csv(DATA_PATH)
    label_maps = _build_label_maps(df, tasks)

    # The split is driven by TEST_SPLIT_SIZE / RANDOM_STATE so that it is
    # reproducible. NOTE(review): this is meant to reproduce the split used
    # during training -- confirm the trainer uses the same arguments.
    _, test_df = train_test_split(
        df,
        test_size=TEST_SPLIT_SIZE,
        random_state=RANDOM_STATE,
        stratify=df['age'],
    )

    # 2. Create the PyTorch DataLoader
    model_config = read_yaml(Path("config/config.yaml"))
    base_model_name = model_config.multi_task_model_trainer.model_name
    processor = AutoImageProcessor.from_pretrained(base_model_name)
    _transforms = Compose([
        Resize((IMAGE_SIZE, IMAGE_SIZE)),
        ToTensor(),
        Normalize(mean=processor.image_mean, std=processor.image_std),
    ])

    test_dataset = FairFaceDataset(dataframe=test_df, processor=processor, transforms=_transforms)
    # shuffle=False keeps predictions and labels in a deterministic order.
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # 3. Load the trained model
    print(f"Loading model from: {MODEL_PATH}")
    model = MultiTaskEfficientNet(
        model_name=str(MODEL_PATH),  # Pass the path as the model name
        num_labels_age=len(label_maps['age_id2label']),
        num_labels_gender=len(label_maps['gender_id2label']),
        num_labels_race=len(label_maps['race_id2label']),
    ).to(device)

    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source (consider weights_only=True where the installed torch
    # version supports it).
    model.load_state_dict(torch.load(MODEL_PATH / 'pytorch_model.bin', map_location=device))
    model.eval()

    # 4. Run predictions on the test set
    print("Running predictions on the test set...")
    all_preds, all_labels = _collect_predictions(model, test_dataloader, device, tasks)

    # 5. Calculate metrics, generate reports, and save artifacts
    print("--- Evaluation Results ---")
    metrics = {}
    for task in tasks:
        id2label = label_maps[f'{task}_id2label']
        # Pass the full id list explicitly: without it scikit-learn infers
        # the classes from the data, and a class missing from the test split
        # would make target_names / tick labels disagree with the results.
        class_ids = list(id2label)
        class_names = [str(name) for name in id2label.values()]
        y_true, y_pred = all_labels[task], all_preds[task]

        accuracy = float(accuracy_score(y_true, y_pred))
        print(f"\n--- {task.capitalize()} ---")
        print(f"Accuracy: {accuracy:.4f}")

        report_str = classification_report(
            y_true,
            y_pred,
            labels=class_ids,
            target_names=class_names,
        )
        print("Classification Report:")
        print(report_str)

        metrics[f'{task}_accuracy'] = accuracy

        # Confusion Matrix
        cm = confusion_matrix(y_true, y_pred, labels=class_ids)
        cm_path = EVALUATION_OUTPUT_DIR / f'{task}_confusion_matrix.png'
        _save_confusion_matrix(cm, class_names, task, cm_path)
        print(f"Saved {task} confusion matrix to {cm_path}")

    # Save metrics to a JSON file
    metrics_path = EVALUATION_OUTPUT_DIR / "metrics.json"
    with open(metrics_path, 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=4)
    print(f"\nSaved final metrics to {metrics_path}")

    # Save the label maps used for this evaluation run
    label_maps_path = EVALUATION_OUTPUT_DIR / "label_maps.json"
    with open(label_maps_path, 'w', encoding='utf-8') as f:
        json.dump(label_maps, f, indent=4)
    print(f"Saved label maps to {label_maps_path}")


def _build_label_maps(df, tasks):
    """Build label<->id tables per task and add ``<task>_id`` columns to *df* in place.

    Ids are assigned by enumerating the sorted unique values of each column.
    Returns a dict with ``<task>_label2id`` and ``<task>_id2label`` entries.
    """
    label_maps = {}
    for task in tasks:
        labels = sorted(df[task].unique())
        label2id = {label: i for i, label in enumerate(labels)}
        label_maps[f'{task}_label2id'] = label2id
        label_maps[f'{task}_id2label'] = dict(enumerate(labels))
        df[f'{task}_id'] = df[task].map(label2id)
    return label_maps


def _collect_predictions(model, dataloader, device, tasks):
    """Run *model* over *dataloader* and return ``(predictions, labels)`` dicts keyed by task."""
    all_preds = {task: [] for task in tasks}
    all_labels = {task: [] for task in tasks}

    for batch in tqdm(dataloader, desc="Evaluating"):
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels']

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values)

        for task in tasks:
            all_preds[task].extend(outputs[f'{task}_logits'].argmax(1).cpu().numpy())
            all_labels[task].extend(labels[task].cpu().numpy())

    return all_preds, all_labels


def _save_confusion_matrix(cm, class_names, task, cm_path):
    """Render *cm* as an annotated heatmap for *task* and write it to *cm_path*."""
    fig = plt.figure(figsize=(12, 10))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            xticklabels=class_names,
            yticklabels=class_names,
            cmap='Blues',
        )
        plt.title(f'Confusion Matrix for {task.capitalize()}', fontsize=16)
        plt.ylabel('Actual', fontsize=12)
        plt.xlabel('Predicted', fontsize=12)
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.savefig(cm_path, bbox_inches='tight')
    finally:
        # Always release the figure, even if rendering or saving fails.
        plt.close(fig)


# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    evaluate_model()