| |
| |
| |
| |
|
|
| import numpy as np |
| import torch |
| from sklearn import metrics |
|
|
def backward_transfer(results):
    """Return the mean backward transfer over every task except the last.

    For each task ``t`` other than the final one, the score stored at
    ``results[t][t]`` is subtracted from the score stored for the same task
    in the last row, ``results[-1][t]``. The differences are averaged.

    Args:
        results: Indexable of rows where ``results[k][t]`` is the score for
            task ``t`` in row ``k`` (presumably measured after training on
            task ``k`` -- confirm against the caller).

    Returns:
        ``np.mean`` of the per-task differences. With fewer than two rows
        the list of differences is empty and the mean is NaN.
    """
    last_task = len(results) - 1
    deltas = [results[-1][t] - results[t][t] for t in range(last_task)]
    return np.mean(deltas)
|
|
def forward_transfer(results, random_results):
    """Return the mean forward transfer over every task except the first.

    For each task ``t >= 1`` the baseline ``random_results[t]`` is
    subtracted from ``results[t - 1][t]``, i.e. the score for task ``t``
    stored in the row just before it. The differences are averaged.

    Args:
        results: Indexable of rows where ``results[k][t]`` is the score for
            task ``t`` in row ``k``.
        random_results: Indexable of per-task baseline scores (presumably
            from an untrained/random model -- confirm against the caller).

    Returns:
        ``np.mean`` of the per-task differences. With fewer than two rows
        the list of differences is empty and the mean is NaN.
    """
    gains = [
        results[t - 1][t] - random_results[t]
        for t in range(1, len(results))
    ]
    return np.mean(gains)
|
|
def forgetting(results):
    """Return the mean forgetting over every task except the last.

    For each task ``t`` other than the final one, forgetting is the best
    score ever recorded for ``t`` (column-wise maximum over all rows) minus
    the score recorded for ``t`` in the last row.

    Rows shorter than the number of tasks are zero-padded on the right so
    the scores form a rectangular matrix. The padding is applied to copies:
    ``results`` itself is never modified.

    Args:
        results: Sequence of rows where ``results[k][t]`` is the score for
            task ``t`` in row ``k``. Rows may be lists, tuples or arrays
            and may be shorter than ``len(results)``.

    Returns:
        ``np.mean`` of the per-task forgetting values. With a single row
        there is nothing to average and the result is NaN.
    """
    n_tasks = len(results)
    # Build padded copies instead of extending the caller's rows in place.
    padded = [list(row) + [0.0] * (n_tasks - len(row)) for row in results]
    scores = np.array(padded)
    best = np.max(scores, axis=0)
    # Only the first n_tasks - 1 tasks contribute; the last task cannot
    # have been forgotten yet.
    drops = best[:n_tasks - 1] - scores[-1, :n_tasks - 1]
    return np.mean(drops)
def calc_aurc_eaurc(softmax, correct):
    """Compute AURC and E-AURC from softmax outputs and correctness flags.

    The confidence of each sample is its maximum value along axis 1 of
    ``softmax``. Samples are ranked from most to least confident, the
    ranked lists are handed to ``coverage_risk`` and the resulting risk
    curve is summarised by ``aurc_eaurc``.

    Args:
        softmax: Array-like with one row of scores per sample.
        correct: Array-like with one correctness flag per sample.

    Returns:
        Tuple ``(aurc, eaurc)`` as produced by ``aurc_eaurc``.
    """
    scores = np.array(softmax)
    is_correct = np.array(correct)
    confidence = np.max(scores, 1)

    # Keep every correctness flag paired with its confidence while sorting
    # in descending confidence order.
    ranked = list(zip(confidence, is_correct))
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    ranked_confidence, ranked_correct = zip(*ranked)

    # Only the risk curve is needed; the coverage values are discarded.
    risks, _coverages = coverage_risk(ranked_confidence, ranked_correct)
    aurc, eaurc = aurc_eaurc(risks)
    return aurc, eaurc
|
|
def calc_fpr_aupr(softmax, correct):
    """Compute AUROC, two AUPR variants and FPR near 95% TPR.

    The confidence of each sample is its maximum value along axis 1 of
    ``softmax`` and is used as the score for separating correct from
    incorrect predictions.

    Args:
        softmax: Array-like with one row of scores per sample.
        correct: Array-like with one correctness flag per sample
            (assumed to be 0/1 -- the error labels are derived as
            ``-1 * correct + 1``).

    Returns:
        Tuple ``(auroc, aupr_success, aupr_err, fpr_in_tpr_95)``:
        area under the ROC curve; area under the precision-recall curve
        with the given flags as labels; average precision with flipped
        labels and negated confidence; and the FPR at the ROC point whose
        TPR is closest to 0.95.
    """
    scores = np.array(softmax)
    is_correct = np.array(correct)
    confidence = np.max(scores, 1)

    # ROC-based metrics.
    fpr, tpr, _ = metrics.roc_curve(is_correct, confidence)
    auroc = metrics.auc(fpr, tpr)
    nearest_to_95 = np.argmin(np.abs(tpr - 0.95))
    fpr_in_tpr_95 = fpr[nearest_to_95]

    # Precision-recall with the given flags as the positive class.
    precision, recall, _ = metrics.precision_recall_curve(is_correct, confidence)
    aupr_success = metrics.auc(recall, precision)

    # Precision-recall with flipped labels; the confidence is negated so
    # that low-confidence samples rank first.
    error_labels = -1 * is_correct + 1
    error_scores = -1 * confidence
    aupr_err = metrics.average_precision_score(error_labels, error_scores)

    return auroc, aupr_success, aupr_err, fpr_in_tpr_95
|
|
def calc_ace(softmax_outputs, targets, num_bins=15):
    """Compute the Adaptive Calibration Error (ACE).

    Bin edges are the quantiles of the per-sample confidence (row-wise
    maximum of ``softmax_outputs``), with the outermost edges forced to
    0.0 and 1.0 and duplicate edges removed. Each non-empty bin
    contributes ``|mean confidence - mean accuracy|`` weighted by the
    fraction of samples it holds.

    Args:
        softmax_outputs: Array of shape (n_samples, n_classes) holding
            softmax probabilities.
        targets: Array of shape (n_samples,) holding the true labels.
        num_bins: Requested number of quantile bins; fewer are used when
            quantile edges coincide.

    Returns:
        The ACE value.
    """
    confidences = np.max(softmax_outputs, axis=1)
    hits = (np.argmax(softmax_outputs, axis=1) == targets).astype(float)

    edges = np.quantile(confidences, np.linspace(0, 1, num_bins + 1))
    edges[0], edges[-1] = 0.0, 1.0
    # Tied quantiles would create empty zero-width bins; drop them.
    edges = np.unique(edges)

    n_samples = len(confidences)
    final_bin = len(edges) - 2
    ace = 0.0
    for idx, (lower, upper) in enumerate(zip(edges[:-1], edges[1:])):
        # Bins are [lower, upper) except the last, which also includes its
        # upper edge so a confidence of exactly 1.0 is counted.
        if idx == final_bin:
            members = (confidences >= lower) & (confidences <= upper)
        else:
            members = (confidences >= lower) & (confidences < upper)

        count = np.sum(members)
        if count == 0:
            continue

        gap = abs(np.mean(confidences[members]) - np.mean(hits[members]))
        ace += (count / n_samples) * gap

    return ace
|
|
def calc_ece(softmax, label, bins=15):
    """Compute the Expected Calibration Error (ECE).

    The interval [0, 1] is split into ``bins`` equal-width bins. Each
    sample is assigned by its confidence (row-wise maximum of ``softmax``)
    and every non-empty bin contributes
    ``|mean confidence - accuracy|`` weighted by its share of the samples.

    Args:
        softmax: Per-sample class scores, convertible with ``torch.tensor``.
        label: Per-sample true labels, convertible with ``torch.tensor``.
        bins: Number of equal-width bins.

    Returns:
        The ECE as a Python float.
    """
    edges = torch.linspace(0, 1, bins + 1)

    probs = torch.tensor(softmax)
    targets = torch.tensor(label)
    confidence, predicted = torch.max(probs, 1)
    hits = predicted.eq(targets.long())

    ece = torch.zeros(1)
    for lower, upper in zip(edges[:-1], edges[1:]):
        # Each bin is half-open on the left: (lower, upper].
        members = confidence.gt(lower.item()) & confidence.le(upper.item())
        weight = members.float().mean()
        if not weight.item() > 0.0:
            continue

        bin_accuracy = hits[members].float().mean()
        bin_confidence = confidence[members].mean()
        ece += torch.abs(bin_confidence - bin_accuracy) * weight

    return ece.item()
|
|
| |
def calc_nll_brier(softmax, logit, label):
    """Compute the negative log-likelihood and the Brier score.

    The Brier score is the mean over samples of the squared distance
    between ``softmax`` and the one-hot encoding of ``label``. The NLL is
    obtained by applying log-softmax (over dim 1) to ``logit`` and passing
    the result to ``calc_nll``.

    Args:
        softmax: Array of per-sample class probabilities, broadcastable
            against the one-hot label matrix.
        logit: 2-D array of raw scores; ``logit.shape[1]`` is taken as the
            number of classes.
        label: Integer class indices, one per sample.

    Returns:
        Tuple ``(nll, brier_score)`` with ``nll`` as a Python float.
    """
    n_classes = logit.shape[1]
    one_hot = np.eye(n_classes)[label]
    squared_error = (softmax - one_hot) ** 2
    brier_score = np.mean(np.sum(squared_error, axis=1))

    logit_t = torch.tensor(logit, dtype=torch.float)
    label_t = torch.tensor(label, dtype=torch.int)
    log_probs = torch.nn.LogSoftmax(dim=1)(logit_t)
    nll = calc_nll(log_probs, label_t)

    return nll.item(), brier_score
|
|
| |
def calc_nll(log_softmax, label):
    """Return the mean negative log-likelihood of the labelled classes.

    For sample ``i`` the entry ``log_softmax[i][label[i]]`` is selected.
    The selected values are summed, negated and divided by the number of
    labels. The selection is one vectorised indexing operation rather than
    a Python loop over samples.

    Args:
        log_softmax: 2-D tensor (or array-like) of log-probabilities with
            one row per sample.
        label: 1-D integer tensor of class indices; only the first
            ``len(label)`` rows of ``log_softmax`` are read.

    Returns:
        0-dim ``torch.float32`` tensor on the same device as ``label``.
    """
    log_softmax = torch.as_tensor(log_softmax)
    n_samples = len(label)
    rows = torch.arange(n_samples, device=log_softmax.device)
    cols = label.to(device=log_softmax.device, dtype=torch.long)
    # Pick one log-probability per sample; cast so the result is float32
    # on the label's device regardless of the input dtype.
    picked = log_softmax[rows, cols].to(device=label.device, dtype=torch.float)
    return -picked.sum() / n_samples
|
|
| |
def coverage_risk(confidence, correctness):
    """Build the risk and coverage curves for an ordered set of samples.

    Samples are consumed in the order given. After the first ``k`` samples
    the coverage is ``k / len(confidence)`` and the risk is the fraction
    of those ``k`` samples whose correctness flag equals 0.

    Args:
        confidence: Sequence of confidences; only its length is used.
        correctness: Sequence of correctness flags aligned with
            ``confidence``; a value equal to 0 counts as an error.

    Returns:
        Tuple ``(risk_list, coverage_list)`` of equal-length lists.
    """
    n_samples = len(confidence)
    risks = []
    coverages = []
    errors = 0
    for kept in range(1, n_samples + 1):
        coverages.append(kept / n_samples)
        if correctness[kept - 1] == 0:
            errors += 1
        risks.append(errors / kept)

    return risks, coverages
|
|
| |
def aurc_eaurc(risk_list):
    """Return the AURC and the excess AURC (E-AURC) of a risk curve.

    AURC is the sum of the risk values, each weighted by
    ``1 / len(risk_list)``. E-AURC subtracts the optimal area
    ``r + (1 - r) * log(1 - r)``, where ``r`` is the last entry of
    ``risk_list`` (the risk at full coverage).

    When ``r`` equals 1 the term ``(1 - r) * log(1 - r)`` is taken as its
    limit value 0 rather than evaluated as ``0 * -inf``, which would make
    E-AURC NaN and emit a runtime warning.

    Args:
        risk_list: Non-empty sequence of risk values, one per coverage
            level, ending with the risk at full coverage.

    Returns:
        Tuple ``(aurc, eaurc)``.

    Raises:
        IndexError: If ``risk_list`` is empty.
    """
    full_coverage_risk = risk_list[-1]
    weight = 1 / len(risk_list)

    # Accumulate left to right so the floating-point result does not
    # depend on the summation strategy of a library routine.
    aurc = 0
    for risk in risk_list:
        aurc += risk * weight

    if full_coverage_risk == 1:
        # lim_{r -> 1} (1 - r) * log(1 - r) = 0
        optimal_risk_area = full_coverage_risk
    else:
        accuracy = 1 - full_coverage_risk
        optimal_risk_area = full_coverage_risk + accuracy * np.log(accuracy)

    eaurc = aurc - optimal_risk_area
    return aurc, eaurc