import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import argparse
import os
from tqdm import tqdm

from models.vision import VisionEmotionModel
from models.audio import AudioEmotionModel
from models.text import TextIntentModel
from models.fusion import MultiModalFusion

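
# Expected batch structure, inferred from how batches are indexed in the
# functions below (the shapes and the optional 'demographic' field are
# assumptions, not a documented contract):
#   {
#       'vision': float tensor [batch, ...],
#       'audio': float tensor [batch, ...],
#       'text': {'input_ids': [batch, seq], 'attention_mask': [batch, seq]},
#       'emotion': int labels [batch],
#       'intent': int labels [batch],
#       'demographic': optional per-sample group labels [batch],
#   }
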
def evaluate_model(model, dataloader, device, task='emotion'):
    """
    Evaluate the model on the given task ('emotion' or 'intent').

    Returns arrays of predictions and ground-truth labels.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Evaluating {task}"):
            # The forward pass is shared across tasks; only the output head
            # and the label key differ.
            vision = batch['vision'].to(device)
            audio = batch['audio'].to(device)
            text_input_ids = batch['text']['input_ids'].to(device)
            text_attention_mask = batch['text']['attention_mask'].to(device)

            outputs = model(vision, audio, text_input_ids, text_attention_mask)

            if task == 'emotion':
                preds = outputs['emotion'].argmax(dim=1)
                labels = batch['emotion'].to(device)
            elif task == 'intent':
                preds = outputs['intent'].argmax(dim=1)
                labels = batch['intent'].to(device)
            else:
                raise ValueError(f"Unknown task: {task}")

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return np.array(all_preds), np.array(all_labels)

def ablation_study(fusion_model, dataloader, device):
    """
    Perform an ablation study by zeroing out input modalities.

    Note: this assumes the dataloader is not shuffled, so predictions from
    each ablated pass stay aligned with the labels collected in the
    full-model pass.
    """
    print("Performing Ablation Study...")

    results = {}

    # Full model: all three modalities available.
    preds, labels = evaluate_model(fusion_model, dataloader, device)
    results['full'] = f1_score(labels, preds, average='weighted')

    # Vision only: zero out the audio and text inputs (the attention mask is
    # left intact so the text encoder still receives a valid mask).
    fusion_model.eval()
    ablation_preds = []
    with torch.no_grad():
        for batch in dataloader:
            vision = batch['vision'].to(device)
            audio = torch.zeros_like(batch['audio']).to(device)
            text_input_ids = torch.zeros_like(batch['text']['input_ids']).to(device)
            text_attention_mask = batch['text']['attention_mask'].to(device)

            outputs = fusion_model(vision, audio, text_input_ids, text_attention_mask)
            preds = outputs['emotion'].argmax(dim=1)
            ablation_preds.extend(preds.cpu().numpy())

    results['vision_only'] = f1_score(labels, ablation_preds, average='weighted')

    # Audio only: zero out the vision and text inputs.
    ablation_preds = []
    with torch.no_grad():
        for batch in dataloader:
            vision = torch.zeros_like(batch['vision']).to(device)
            audio = batch['audio'].to(device)
            text_input_ids = torch.zeros_like(batch['text']['input_ids']).to(device)
            text_attention_mask = batch['text']['attention_mask'].to(device)

            outputs = fusion_model(vision, audio, text_input_ids, text_attention_mask)
            preds = outputs['emotion'].argmax(dim=1)
            ablation_preds.extend(preds.cpu().numpy())

    results['audio_only'] = f1_score(labels, ablation_preds, average='weighted')

    # Text only: zero out the vision and audio inputs.
    ablation_preds = []
    with torch.no_grad():
        for batch in dataloader:
            vision = torch.zeros_like(batch['vision']).to(device)
            audio = torch.zeros_like(batch['audio']).to(device)
            text_input_ids = batch['text']['input_ids'].to(device)
            text_attention_mask = batch['text']['attention_mask'].to(device)

            outputs = fusion_model(vision, audio, text_input_ids, text_attention_mask)
            preds = outputs['emotion'].argmax(dim=1)
            ablation_preds.extend(preds.cpu().numpy())

    results['text_only'] = f1_score(labels, ablation_preds, average='weighted')

    return results

def bias_analysis(model, dataloader, device, demographic_groups):
    """
    Analyze performance across demographic groups.

    Assumes each batch carries a 'demographic' field with one group label
    per sample.
    """
    print("Performing Bias Analysis...")

    bias_results = {}

    model.eval()
    with torch.no_grad():
        for group in demographic_groups:
            group_preds = []
            group_labels = []

            for batch in dataloader:
                if 'demographic' not in batch:
                    continue

                # Select only the samples in this batch that belong to the
                # current demographic group.
                mask = torch.tensor(
                    [d == group for d in batch['demographic']], dtype=torch.bool
                )
                if not mask.any():
                    continue

                vision = batch['vision'][mask].to(device)
                audio = batch['audio'][mask].to(device)
                text_input_ids = batch['text']['input_ids'][mask].to(device)
                text_attention_mask = batch['text']['attention_mask'][mask].to(device)

                outputs = model(vision, audio, text_input_ids, text_attention_mask)
                preds = outputs['emotion'].argmax(dim=1)
                labels = batch['emotion'][mask]

                group_preds.extend(preds.cpu().numpy())
                group_labels.extend(labels.cpu().numpy())

            if group_preds:
                bias_results[group] = {
                    'f1': f1_score(group_labels, group_preds, average='weighted'),
                    'accuracy': np.mean(np.array(group_preds) == np.array(group_labels))
                }

    return bias_results

def plot_confusion_matrix(cm, labels, save_path):
    """
    Plot and save a confusion matrix heatmap.
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

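
# Sketch of how evaluate_model and plot_confusion_matrix fit together once a
# test DataLoader exists. `test_loader` and `EMOTION_CLASSES` are illustrative
# names, not defined in this project:
#
#   preds, labels = evaluate_model(fusion_model, test_loader, device, task='emotion')
#   cm = confusion_matrix(labels, preds)
#   plot_confusion_matrix(cm, EMOTION_CLASSES, save_path='confusion_matrix.png')
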
def generate_report(results, ablation_results, bias_results, output_dir):
    """
    Generate a comprehensive evaluation report in Markdown.
    """
    report = f"""
# EMOTIA Model Evaluation Report

## Overall Performance
- Emotion F1-Score: {results['emotion_f1']:.4f}
- Intent F1-Score: {results['intent_f1']:.4f}
- Engagement MAE: {results['engagement_mae']:.4f}
- Confidence MAE: {results['confidence_mae']:.4f}

## Ablation Study Results
{chr(10).join([f"- {k}: {v:.4f}" for k, v in ablation_results.items()])}

## Bias Analysis
"""

    if bias_results:
        for group, metrics in bias_results.items():
            report += f"- {group}: F1={metrics['f1']:.4f}, Acc={metrics['accuracy']:.4f}\n"
    else:
        report += "No demographic data available for bias analysis.\n"

    report += """
## Recommendations
- Focus on improving the weakest modality based on the ablation results.
- Monitor and mitigate biases identified in the demographic analysis.
- Consider additional data augmentation for underrepresented classes.
"""

    report_path = os.path.join(output_dir, 'evaluation_report.md')
    with open(report_path, 'w') as f:
        f.write(report)

    print(f"Evaluation report saved to {report_path}")

def main(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the trained fusion model.
    fusion_model = MultiModalFusion().to(device)
    fusion_model.load_state_dict(torch.load(args.model_path, map_location=device))
    fusion_model.eval()

    # TODO: build the test DataLoader from args.data_dir and run
    # evaluate_model, ablation_study, and bias_analysis on it.
    print("Evaluation framework ready. Implement data loading for full evaluation.")

    # Placeholder metrics used to exercise report generation until the data
    # loading above is implemented.
    results = {
        'emotion_f1': 0.85,
        'intent_f1': 0.78,
        'engagement_mae': 0.12,
        'confidence_mae': 0.15
    }

    ablation_results = {
        'full': 0.85,
        'vision_only': 0.72,
        'audio_only': 0.68,
        'text_only': 0.75
    }

    bias_results = {}

    os.makedirs(args.output_dir, exist_ok=True)
    generate_report(results, ablation_results, bias_results, args.output_dir)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate EMOTIA Model")
    parser.add_argument('--model_path', type=str, required=True, help='Path to trained model')
    parser.add_argument('--data_dir', type=str, required=True, help='Path to test data')
    parser.add_argument('--output_dir', type=str, default='./evaluation_results', help='Output directory')
    parser.add_argument('--batch_size', type=int, default=16, help='Batch size')

    args = parser.parse_args()
    main(args)
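
# Example invocation (a sketch; the script filename and checkpoint path are
# assumptions, not part of the original project):
#
#   python evaluate.py \
#       --model_path checkpoints/fusion_model.pt \
#       --data_dir data/test \
#       --output_dir ./evaluation_results \
#       --batch_size 16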