Spaces:
Running
Running
| """Benchmark suite for baseline vs upgraded DDI models.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| from typing import Dict | |
| import numpy as np | |
| from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, f1_score, precision_recall_fscore_support, recall_score, roc_auc_score | |
| from training.calibration import expected_calibration_error | |
def evaluate(y_true: np.ndarray, y_pred: np.ndarray, y_proba: np.ndarray) -> Dict:
    """Compute classification, ranking and calibration metrics for one model.

    Args:
        y_true: Integer class labels; every value must lie in
            ``[0, num_classes)``.
        y_pred: Predicted class labels aligned with ``y_true``.
        y_proba: Per-class scores with one column per class. The number of
            columns defines ``num_classes``.

    Returns:
        A JSON-serialisable dict with ``accuracy``, ``macro_precision``,
        ``macro_recall``, ``macro_f1``, ``severe_recall``,
        ``auroc_macro_ovr``, ``auprc_macro``, ``ece``, ``confusion_matrix``
        (nested lists) and ``per_class`` (precision/recall/f1/support keyed
        by the class index as a string). ``severe_recall`` is the recall of
        the last class index (``num_classes - 1``), which this module treats
        as the severe ("major") class.

    Raises:
        ValueError: If ``y_proba`` is not 2-D, or ``y_true`` contains a label
            outside ``[0, num_classes)``.

    Note:
        All macro averages are taken over the full ``range(num_classes)``
        label set, so baseline and upgraded models are always averaged over
        the same classes regardless of which labels each one predicts.
        NOTE(review): when a class has no positive samples in ``y_true`` the
        AUROC call may raise or yield NaN depending on the installed
        scikit-learn version - confirm against the pinned version.
    """
    if y_proba.ndim != 2:
        raise ValueError(
            f'y_proba must be 2-D (n_samples, num_classes); got shape {y_proba.shape}'
        )

    num_classes = y_proba.shape[1]
    class_ids = list(range(num_classes))
    major_idx = num_classes - 1

    # Guard the one-hot lookup below: a negative label would silently wrap
    # around to another class instead of failing.
    labels_true = np.asarray(y_true)
    if labels_true.size and (labels_true.min() < 0 or labels_true.max() >= num_classes):
        raise ValueError(
            f'y_true labels must be in [0, {num_classes}); '
            f'got range [{labels_true.min()}, {labels_true.max()}]'
        )

    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, zero_division=0
    )

    # One-vs-rest AUROC/AUPRC macro: one-hot targets scored column by column.
    y_true_ovr = np.eye(num_classes)[labels_true]
    auroc = float(roc_auc_score(y_true_ovr, y_proba, average='macro', multi_class='ovr'))
    auprc = float(average_precision_score(y_true_ovr, y_proba, average='macro'))

    ece = float(expected_calibration_error(y_true, y_proba, n_bins=15))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    return {
        'accuracy': float(accuracy_score(y_true, y_pred)),
        'macro_precision': float(np.mean(precision)),
        'macro_recall': float(np.mean(recall)),
        # Average the per-class F1 over the same explicit label set as the
        # macro precision/recall above, rather than over whichever labels
        # happen to appear in y_true/y_pred.
        'macro_f1': float(np.mean(f1)),
        # Recall restricted to the last class is exactly its per-class recall.
        'severe_recall': float(recall[major_idx]),
        'auroc_macro_ovr': auroc,
        'auprc_macro': auprc,
        'ece': ece,
        'confusion_matrix': cm.tolist(),
        'per_class': {
            str(i): {
                'precision': float(precision[i]),
                'recall': float(recall[i]),
                'f1': float(f1[i]),
                'support': int(support[i]),
            }
            for i in class_ids
        },
    }
def main() -> None:
    """Command-line entry point: score two prediction dumps and write a report.

    Reads the baseline and upgraded JSON files (each providing ``y_true``,
    ``y_pred`` and ``y_proba``), checks that both were produced against the
    same ground-truth labels, evaluates each one, and writes a JSON document
    with ``baseline``, ``upgraded`` and ``delta`` sections to ``--out-json``.
    The output file's parent directory is created if it does not exist.

    Raises:
        ValueError: If the two files do not carry identical ``y_true`` arrays.
    """
    cli = argparse.ArgumentParser(description='Benchmark baseline vs upgraded predictions')
    cli.add_argument('--baseline-json', type=str, required=True, help='JSON with y_true,y_pred,y_proba for baseline')
    cli.add_argument('--upgraded-json', type=str, required=True, help='JSON with y_true,y_pred,y_proba for upgraded model')
    cli.add_argument('--out-json', type=str, required=True)
    opts = cli.parse_args()

    def _read_payload(location):
        # Parse one UTF-8 encoded JSON prediction dump.
        return json.loads(Path(location).read_text(encoding='utf-8'))

    def _as_arrays(payload):
        # Convert the three prediction fields to typed NumPy arrays.
        labels = np.array(payload['y_true'], dtype=np.int64)
        preds = np.array(payload['y_pred'], dtype=np.int64)
        probs = np.array(payload['y_proba'], dtype=np.float32)
        return labels, preds, probs

    # Both files are parsed before any array conversion takes place.
    baseline_payload = _read_payload(opts.baseline_json)
    upgraded_payload = _read_payload(opts.upgraded_json)

    y_true, base_pred, base_proba = _as_arrays(baseline_payload)
    up_true, up_pred, up_proba = _as_arrays(upgraded_payload)

    # The comparison is only meaningful when both runs share ground truth.
    labels_match = y_true.shape == up_true.shape and np.array_equal(y_true, up_true)
    if not labels_match:
        raise ValueError('baseline and upgraded y_true must match exactly for fair benchmark')

    baseline_metrics = evaluate(y_true, base_pred, base_proba)
    upgraded_metrics = evaluate(y_true, up_pred, up_proba)

    # Scalar metrics reported as upgraded-minus-baseline differences.
    compared = ('accuracy', 'macro_f1', 'severe_recall', 'auroc_macro_ovr', 'auprc_macro')
    report = {
        'baseline': baseline_metrics,
        'upgraded': upgraded_metrics,
        'delta': {key: upgraded_metrics[key] - baseline_metrics[key] for key in compared},
    }

    destination = Path(opts.out_json)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2), encoding='utf-8')
# Run the benchmark CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()