import csv
import json
import logging
import os

import numpy as np
import pandas as pd
from fairlearn.metrics import (
    MetricFrame,
    count,
    demographic_parity_difference,
    equal_opportunity_difference,
    equalized_odds_difference,
    false_negative_rate,
    false_positive_rate,
    selection_rate,
)
from scipy.special import softmax
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    recall_score,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)


def logits_to_probs(logits, config):
    """Convert raw model outputs into class posterior probabilities.

    Posterior probabilities are calculated differently in some experiments,
    selected by flags in ``config``:

    * ``domain_independent_loss``: the logit columns are split into
      ``config["num_groups"]`` equal chunks, the chunks are summed
      element-wise and a softmax is applied to the sum.
    * ``domain_discriminative_loss``: a softmax over all columns is scaled by
      fixed prior-shift weights, the columns are split into
      ``config["num_groups"]`` chunks and summed, and a second softmax is
      applied to the result.
    * otherwise: a plain softmax over ``axis=1``.

    Parameters:
        logits: 2-D array of model outputs (one row per sample).
        config: Mapping with the experiment configuration.

    Returns:
        Array of per-class scores, one row per sample.
    """
    if config.get("domain_independent_loss", False):
        per_group = np.split(logits, config["num_groups"], axis=1)
        marginalized = np.sum(per_group, axis=0)
        return softmax(marginalized, axis=1)

    if config.get("domain_discriminative_loss", False):
        # Prior shift inference, train distribution.
        # NOTE(review): 8 weights, presumably one per (group, class) output
        # column -- confirm the ordering matches the logit layout.
        prior_shift_weight = (
            np.array(
                [
                    1088 / 1072,
                    1088 / 16,
                    17746 / 17515,
                    17746 / 231,
                    6454 / 6273,
                    6454 / 181,
                    850 / 834,
                    850 / 16,
                ]
            )
            / 100
        )
        probs_yd = softmax(logits, axis=1) * prior_shift_weight
        per_group = np.split(probs_yd, config["num_groups"], axis=1)
        marginalized = np.sum(per_group, axis=0)
        # We shifted probs, apply softmax once more
        return softmax(marginalized, axis=1)

    return softmax(logits, axis=1)


# Fairlearn docs
def compute_error_metric(metric_value, sample_size):
    """Compute the 95% error-bar half-width of a rate, assuming normality.

    The rate is ``metric_value / sample_size``; the result is 1.96 times the
    binomial standard error ``sqrt(p * (1 - p) / n)``.

    Parameters:
        metric_value: Count of events (numerator of the rate)
        sample_size: Number of data points associated with the metric

    Returns:
        1.96 times the standard error of the rate, or NaN when
        ``sample_size`` is zero (the rate is undefined).
    """
    if sample_size == 0:
        # 0/0 would yield NaN anyway; return it without a RuntimeWarning.
        return float("nan")
    rate = metric_value / sample_size
    return 1.96 * np.sqrt(rate * (1.0 - rate)) / np.sqrt(sample_size)


def false_positive_error(y_true, y_pred):
    """Compute the error-bar half-width for the false positive rate estimate."""
    # labels=[0, 1] forces a 2x2 matrix even if a group contains one class
    # only; without it the 4-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return compute_error_metric(fp, tn + fp)


def false_negative_error(y_true, y_pred):
    """Compute the error-bar half-width for the false negative rate estimate."""
    # labels=[0, 1] forces a 2x2 matrix even if a group contains one class
    # only; without it the 4-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return compute_error_metric(fn, fn + tp)


def balanced_accuracy_error(y_true, y_pred):
    """Compute the error-bar half-width for the balanced accuracy estimate.

    Combines the false positive and false negative error terms as
    ``sqrt(fnr_error**2 + fpr_error**2) / 2``.
    """
    fpr_error = false_positive_error(y_true, y_pred)
    fnr_error = false_negative_error(y_true, y_pred)
    return np.sqrt(fnr_error**2 + fpr_error**2) / 2


if __name__ == "__main__":
    root_dir = "C:\\Users\\Duje\\Desktop\\fer\\8. semestar\\lumen\\rezultati\\02 eksperimenti\\"
    common_csv = "rezultati.csv"
    disagg_csv = "disaggregated.csv"

    experiments = [
        "01 baseline 0304",
        "02 recall ce 0304",
        "04 cielab re based",
        "05 cielab ohem",
        "08 optim params large",
        "10 transformer\\normal",
        "11 transformer ohem",
        "12 domain discriminative\\new",
        "13 oversampler",
        "15 focal loss\\new",
        "14 domain independent\\new",
        "16 efficient m\\new",
        "17 masked\\new",
        "18 efficient l\\new",
        "19 oversampler trio\\1 base",
        "19 oversampler trio\\2 ifw, recall_ce",
        "19 oversampler trio\\3 ifw, ohem",
        "20 dino\\new",
        "21 dino oversample",
        "22 dino undersample",
        # The comma after the next entry was missing, which silently merged
        # the last two experiment names into one non-existent path.
        "23 long train 04",
        "24 dd transformer",
    ]

    # Metric functions evaluated overall and per sensitive group.
    metrics = dict(
        count=count,
        f1=f1_score,
        recall=recall_score,
        accuracy=accuracy_score,
        selection_rate=selection_rate,
        balanced_accuracy=balanced_accuracy_score,
        balanced_acc_error=balanced_accuracy_error,
        false_positive_rate=false_positive_rate,
        false_positive_error=false_positive_error,
        false_negative_rate=false_negative_rate,
        false_negative_error=false_negative_error,
    )

    logging.info(f"Collecting metrics for {len(experiments)} experiments")

    for exp in experiments:
        eval_dir = os.path.join(root_dir, exp, "eval")

        # The checkpoint folder is the first sub-directory of eval_dir.
        _, subdirs, _ = next(os.walk(eval_dir), (None, [], None))
        if not subdirs:
            raise FileNotFoundError(f"No checkpoint folder found in {eval_dir}")
        chkpt = subdirs[0]
        chkpt_dir = os.path.join(eval_dir, chkpt)
        logging.info(f"Evaluating checkpoint {chkpt} for experiment {exp}")

        with open(os.path.join(root_dir, exp, "config.json")) as f:
            config = json.load(f)

        y_true = np.load(os.path.join(chkpt_dir, "y_true.npy"))
        logits = np.load(os.path.join(chkpt_dir, "logits.npy"))
        groups = np.load(os.path.join(chkpt_dir, "groups.npy"))

        y_prob = logits_to_probs(logits, config)
        y_pred = np.argmax(y_prob, axis=1)

        prob_path = os.path.join(chkpt_dir, "probs.npy")
        np.save(prob_path, y_prob)
        logging.info(f"Saved posteriror probabilities to {prob_path}")

        mf = MetricFrame(
            metrics=metrics,
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=groups,
        )

        # float() turns NumPy scalars into plain Python floats uniformly
        # (eq_odds previously skipped the conversion the other two had).
        dpd = float(
            demographic_parity_difference(y_true, y_pred, sensitive_features=groups)
        )
        eq_odds = float(
            equalized_odds_difference(y_true, y_pred, sensitive_features=groups)
        )
        eq_opp = float(
            equal_opportunity_difference(y_true, y_pred, sensitive_features=groups)
        )

        # Between-group differences for a subset of the metrics.
        diffs = mf.difference()[
            [
                "f1",
                "recall",
                "accuracy",
                "balanced_accuracy",
                "false_positive_rate",
                "false_negative_rate",
            ]
        ]
        diffs = diffs.rename(
            dict(
                f1="f1_diff",
                recall="recall_diff",
                accuracy="accuracy_diff",
                balanced_accuracy="balanced_acc_diff",
                false_positive_rate="fpr_diff",
                false_negative_rate="fnr_diff",
            )
        )

        # One row for each experiment
        fair = pd.Series([dpd, eq_odds, eq_opp], index=["dpd", "eq_odds", "eq_opp"])
        fair = fair.add(diffs, fill_value=0)
        result = mf.overall.add(fair, fill_value=0)

        header = ["experiment"] + result.keys().to_list()
        row = [exp] + result.to_list()

        # Write the header only when the shared CSV is being created.
        write_header = not os.path.isfile(common_csv)
        with open(common_csv, "a", newline="") as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(header)
            writer.writerow(row)
        logging.info(f"Added row to {common_csv}")

        # Disaggregated metrics
        # "sensitive_feature_0" is the name of the by_group index, not a
        # column, so rename(columns=...) had no effect; rename the index axis.
        group = mf.by_group.rename_axis(index="group")
        group.to_csv(os.path.join(chkpt_dir, disagg_csv))
        logging.info(f"Saved disaggregated metrics to {disagg_csv}")

    logging.info("Done")