# Copyright 2020-present, Pietro Buzzega, Matteo Boschini, Angelo Porrello, Davide Abati, Simone Calderara.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch

try:
    from sklearn import metrics
except ImportError:  # scikit-learn is only needed by calc_fpr_aupr
    metrics = None


def backward_transfer(results):
    """Return the mean of ``results[-1][i] - results[i][i]`` over all but the last task.

    Args:
        results: sequence of per-task result rows; ``results[t][i]`` is read
            for ``t == i`` and for the last row.

    Returns:
        ``np.mean`` of the differences (NaN, with a NumPy warning, when there
        are fewer than two rows).
    """
    n_tasks = len(results)
    deltas = [results[-1][i] - results[i][i] for i in range(n_tasks - 1)]
    return np.mean(deltas)


def forward_transfer(results, random_results):
    """Return the mean of ``results[i - 1][i] - random_results[i]`` for ``i >= 1``.

    Args:
        results: sequence of per-task result rows.
        random_results: per-task baseline values, indexed like the columns
            of ``results``.

    Returns:
        ``np.mean`` of the differences (NaN, with a NumPy warning, when there
        are fewer than two rows).
    """
    n_tasks = len(results)
    gains = [results[i - 1][i] - random_results[i] for i in range(1, n_tasks)]
    return np.mean(gains)


def forgetting(results):
    """Return the mean gap between each task's best value and its final value.

    Rows shorter than the number of tasks are right-padded with ``0.0`` on a
    private copy, so the caller's ``results`` is left untouched.

    Args:
        results: sequence of per-task result rows (row ``t`` may contain
            fewer than ``len(results)`` entries).

    Returns:
        ``np.mean`` over all but the last task of
        ``max_t results[t][i] - results[-1][i]``.
    """
    n_tasks = len(results)
    # Pad a copy instead of extending the caller's rows in place.
    padded = np.array(
        [list(row) + [0.0] * (n_tasks - len(row)) for row in results]
    )
    best = np.max(padded, axis=0)
    return np.mean(best[:n_tasks - 1] - padded[-1, :n_tasks - 1])


def calc_aurc_eaurc(softmax, correct):
    """Compute AURC and E-AURC from per-sample scores and correctness flags.

    Samples are ordered by descending maximum score (ties keep their input
    order) before the risk curve is accumulated.

    Args:
        softmax: 2-D array-like; the maximum over axis 1 is used as the
            confidence of each sample.
        correct: 1-D array-like; entries equal to ``0`` count as errors.

    Returns:
        Tuple ``(aurc, eaurc)``.
    """
    softmax = np.array(softmax)
    correctness = np.array(correct)
    softmax_max = np.max(softmax, 1)

    # Stable sort on the negated scores == descending order with ties kept
    # in their original relative order.
    order = np.argsort(-softmax_max, kind="stable")
    risk_li, _coverage_li = coverage_risk(softmax_max[order], correctness[order])
    return aurc_eaurc(risk_li)


def calc_fpr_aupr(softmax, correct):
    """Compute AUROC, AUPR-success, AUPR-error and the FPR nearest to 95% TPR.

    Args:
        softmax: 2-D array-like; the maximum over axis 1 is used as the
            confidence of each sample.
        correct: 1-D array-like of correctness flags used as the positive
            label for the ROC / precision-recall curves.

    Returns:
        Tuple ``(auroc, aupr_success, aupr_err, fpr_in_tpr_95)``.

    Raises:
        ImportError: if scikit-learn is not installed.
    """
    if metrics is None:
        raise ImportError("calc_fpr_aupr requires scikit-learn")

    softmax = np.array(softmax)
    correctness = np.array(correct)
    softmax_max = np.max(softmax, 1)

    fpr, tpr, _ = metrics.roc_curve(correctness, softmax_max)
    auroc = metrics.auc(fpr, tpr)
    # Pick the ROC point whose TPR is closest to 0.95.
    idx_tpr_95 = np.argmin(np.abs(tpr - 0.95))
    fpr_in_tpr_95 = fpr[idx_tpr_95]

    precision, recall, _ = metrics.precision_recall_curve(correctness, softmax_max)
    aupr_success = metrics.auc(recall, precision)
    # Errors become the positive class and the score is negated.
    aupr_err = metrics.average_precision_score(-1 * correctness + 1, -1 * softmax_max)

    return auroc, aupr_success, aupr_err, fpr_in_tpr_95


def calc_ace(softmax_outputs, targets, num_bins=15):
    """
    Calculate Adaptive Calibration Error (ACE)

    Bin edges are the quantiles of the confidences, with the outer edges
    forced to 0.0 and 1.0 and duplicate edges removed.

    Args:
        softmax_outputs: numpy array of shape (n_samples, n_classes) - softmax probabilities
        targets: numpy array of shape (n_samples,) - true labels
        num_bins: number of bins for calibration

    Returns:
        ace: Adaptive Calibration Error value
    """
    softmax_outputs = np.asarray(softmax_outputs)
    targets = np.asarray(targets)

    confidences = np.max(softmax_outputs, axis=1)
    predictions = np.argmax(softmax_outputs, axis=1)
    accuracies = (predictions == targets).astype(float)

    bin_boundaries = np.quantile(confidences, np.linspace(0, 1, num_bins + 1))
    bin_boundaries[0] = 0.0
    bin_boundaries[-1] = 1.0
    # Duplicate quantiles would create empty bins, so collapse them.
    bin_boundaries = np.unique(bin_boundaries)
    actual_num_bins = len(bin_boundaries) - 1

    ace = 0.0
    total_samples = len(confidences)
    for i in range(actual_num_bins):
        bin_lower = bin_boundaries[i]
        bin_upper = bin_boundaries[i + 1]
        above_lower = confidences >= bin_lower
        if i == actual_num_bins - 1:
            # The last bin is closed on the right so confidence 1.0 is kept.
            in_bin = above_lower & (confidences <= bin_upper)
        else:
            in_bin = above_lower & (confidences < bin_upper)

        bin_size = np.sum(in_bin)
        if bin_size > 0:
            bin_confidence = np.mean(confidences[in_bin])
            bin_accuracy = np.mean(accuracies[in_bin])
            ace += (bin_size / total_samples) * abs(bin_confidence - bin_accuracy)

    return ace


def calc_ece(softmax, label, bins=15):
    """Compute the Expected Calibration Error over equal-width confidence bins.

    Each bin is the half-open interval ``(lower, upper]`` on the maximum
    score of a sample.

    Args:
        softmax: 2-D array-like / tensor of per-class scores.
        label: 1-D array-like / tensor of integer labels.
        bins: number of equal-width bins on ``[0, 1]``.

    Returns:
        The ECE as a Python float.
    """
    bin_boundaries = torch.linspace(0, 1, bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    softmax = torch.as_tensor(softmax)
    labels = torch.as_tensor(label)

    softmax_max, predictions = torch.max(softmax, 1)
    correctness = predictions.eq(labels.long())

    ece = torch.zeros(1)
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        in_bin = softmax_max.gt(bin_lower.item()) & softmax_max.le(bin_upper.item())
        prop_in_bin = in_bin.float().mean()

        if prop_in_bin.item() > 0.0:
            accuracy_in_bin = correctness[in_bin].float().mean()
            avg_confidence_in_bin = softmax_max[in_bin].mean()
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

    return ece.item()


# NLL & Brier Score
def calc_nll_brier(softmax, logit, label):
    """Compute the negative log-likelihood and the Brier score.

    Args:
        softmax: array of per-class probabilities, same shape as ``logit``.
        logit: 2-D array of raw scores; ``logit.shape[1]`` gives the number
            of classes.
        label: integer class indices used to build the one-hot targets.

    Returns:
        Tuple ``(nll, brier_score)`` where ``nll`` is a Python float.
    """
    nb_cls = logit.shape[1]
    label_onehot = np.eye(nb_cls)[label]
    brier_score = np.mean(np.sum((softmax - label_onehot) ** 2, axis=1))

    logit = torch.as_tensor(logit, dtype=torch.float)
    label = torch.as_tensor(label, dtype=torch.long)
    log_softmax = torch.log_softmax(logit, dim=1)
    nll = calc_nll(log_softmax, label)

    return nll.item(), brier_score


# Calc NLL
def calc_nll(log_softmax, label):
    """Return the mean negative log-probability assigned to the given labels.

    Args:
        log_softmax: 2-D tensor of log-probabilities.
        label: 1-D tensor (or array-like) of integer class indices.

    Returns:
        0-dim float32 tensor ``-sum_i log_softmax[i, label[i]] / len(label)``.
    """
    label = torch.as_tensor(label, device=log_softmax.device).long()
    rows = torch.arange(label.shape[0], device=log_softmax.device)
    # One indexed lookup replaces the per-sample Python loop.
    picked = log_softmax[rows, label].to(torch.float)
    return -picked.sum() / len(picked)


# Calc coverage, risk
def coverage_risk(confidence, correctness):
    """Compute the cumulative risk and coverage at every prefix of the samples.

    Only the length of ``confidence`` is used; the caller is expected to
    pass the samples already ordered.

    Args:
        confidence: sequence of per-sample confidences.
        correctness: sequence of flags; entries equal to ``0`` are errors.

    Returns:
        Tuple ``(risk_list, coverage_list)`` of plain Python lists, where
        entry ``i`` is ``errors_in_first_(i+1) / (i+1)`` and
        ``(i+1) / n`` respectively.
    """
    n_samples = len(confidence)
    counts = np.arange(1, n_samples + 1)
    is_error = np.asarray(correctness)[:n_samples] == 0
    risk_list = (np.cumsum(is_error) / counts).tolist()
    coverage_list = (counts / n_samples).tolist()
    return risk_list, coverage_list


# Calc aurc, eaurc
def aurc_eaurc(risk_list):
    """Compute AURC and E-AURC from a cumulative risk curve.

    Args:
        risk_list: non-empty sequence of risks; the last entry is the risk
            at full coverage.

    Returns:
        Tuple ``(aurc, eaurc)`` where ``aurc`` is the mean of ``risk_list``
        and ``eaurc`` subtracts ``r + (1 - r) * log(1 - r)`` with
        ``r = risk_list[-1]``.
    """
    r = risk_list[-1]
    if r == 1:
        # (1 - r) * log(1 - r) -> 0 as r -> 1; evaluating it directly
        # would give 0 * -inf = NaN.
        optimal_risk_area = 1.0
    else:
        optimal_risk_area = r + (1 - r) * np.log(1 - r)

    aurc = float(np.mean(risk_list))
    eaurc = float(aurc - optimal_risk_area)
    return aurc, eaurc