"""Benchmark suite for baseline vs upgraded DDI models.""" from __future__ import annotations import argparse import json from pathlib import Path from typing import Dict import numpy as np from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, f1_score, precision_recall_fscore_support, recall_score, roc_auc_score from training.calibration import expected_calibration_error def evaluate(y_true: np.ndarray, y_pred: np.ndarray, y_proba: np.ndarray) -> Dict: num_classes = y_proba.shape[1] major_idx = num_classes - 1 precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred, labels=list(range(num_classes)), zero_division=0) # One-vs-rest AUROC/AUPRC macro y_true_ovr = np.eye(num_classes)[y_true] auroc = float(roc_auc_score(y_true_ovr, y_proba, average='macro', multi_class='ovr')) auprc = float(average_precision_score(y_true_ovr, y_proba, average='macro')) ece = float(expected_calibration_error(y_true, y_proba, n_bins=15)) cm = confusion_matrix(y_true, y_pred, labels=list(range(num_classes))) return { 'accuracy': float(accuracy_score(y_true, y_pred)), 'macro_precision': float(np.mean(precision)), 'macro_recall': float(np.mean(recall)), 'macro_f1': float(f1_score(y_true, y_pred, average='macro', zero_division=0)), 'severe_recall': float(recall_score(y_true, y_pred, labels=[major_idx], average='macro', zero_division=0)), 'auroc_macro_ovr': auroc, 'auprc_macro': auprc, 'ece': ece, 'confusion_matrix': cm.tolist(), 'per_class': { str(i): { 'precision': float(precision[i]), 'recall': float(recall[i]), 'f1': float(f1[i]), 'support': int(support[i]), } for i in range(num_classes) }, } def main() -> None: parser = argparse.ArgumentParser(description='Benchmark baseline vs upgraded predictions') parser.add_argument('--baseline-json', type=str, required=True, help='JSON with y_true,y_pred,y_proba for baseline') parser.add_argument('--upgraded-json', type=str, required=True, help='JSON with y_true,y_pred,y_proba for upgraded model') parser.add_argument('--out-json', type=str, required=True) args = parser.parse_args() baseline = json.loads(Path(args.baseline_json).read_text(encoding='utf-8')) upgraded = json.loads(Path(args.upgraded_json).read_text(encoding='utf-8')) y_true = np.array(baseline['y_true'], dtype=np.int64) base_pred = np.array(baseline['y_pred'], dtype=np.int64) base_proba = np.array(baseline['y_proba'], dtype=np.float32) up_true = np.array(upgraded['y_true'], dtype=np.int64) up_pred = np.array(upgraded['y_pred'], dtype=np.int64) up_proba = np.array(upgraded['y_proba'], dtype=np.float32) if y_true.shape != up_true.shape or not np.array_equal(y_true, up_true): raise ValueError('baseline and upgraded y_true must match exactly for fair benchmark') b = evaluate(y_true, base_pred, base_proba) u = evaluate(y_true, up_pred, up_proba) out = { 'baseline': b, 'upgraded': u, 'delta': { 'accuracy': u['accuracy'] - b['accuracy'], 'macro_f1': u['macro_f1'] - b['macro_f1'], 'severe_recall': u['severe_recall'] - b['severe_recall'], 'auroc_macro_ovr': u['auroc_macro_ovr'] - b['auroc_macro_ovr'], 'auprc_macro': u['auprc_macro'] - b['auprc_macro'], }, } out_path = Path(args.out_json) out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(json.dumps(out, indent=2), encoding='utf-8') if __name__ == '__main__': main()