melanoma_classification / src /scripts /get_results.py
Mhara's picture
Upload folder using huggingface_hub
dae5c90 verified
Raw
History Blame Contribute Delete
7.42 kB
import csv
import json
import os
import logging
import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.metrics import (
confusion_matrix,
recall_score,
f1_score,
accuracy_score,
balanced_accuracy_score,
)
from fairlearn.metrics import (
count,
false_positive_rate,
false_negative_rate,
selection_rate,
demographic_parity_difference,
equal_opportunity_difference,
equalized_odds_difference,
)
from fairlearn.metrics import MetricFrame
# Root logger setup: timestamped INFO-and-above records go to a stream handler.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)
def logits_to_probs(logits, config):
    """Turn raw model logits into posterior probabilities.

    How the posterior is obtained depends on which loss flag is set in
    ``config`` (the first matching rule wins):

    * ``domain_independent_loss``: the columns of ``logits`` are split into
      ``config["num_groups"]`` equally sized chunks, the chunks are added
      element-wise, and a softmax is taken over the summed columns.
    * ``domain_discriminative_loss``: a softmax over all columns is
      multiplied by fixed prior-shift weights, the weighted columns are
      split into ``config["num_groups"]`` chunks and added element-wise,
      and a second softmax is applied to the sum.
    * neither flag set: a plain softmax over the columns.

    Parameters:
        logits: 2-D array; every operation here works along axis 1.
        config: Mapping read with ``.get`` for the two loss flags and with
            ``["num_groups"]`` when either flag is set.
    Returns:
        2-D array of softmax outputs (each row sums to 1).
    """

    def _sum_over_groups(values):
        # Cut the columns into equal per-group chunks and add the chunks.
        chunks = np.split(values, config["num_groups"], axis=1)
        return np.sum(chunks, axis=0)

    if config.get("domain_independent_loss", False):
        scores = _sum_over_groups(logits)
    elif config.get("domain_discriminative_loss", False):
        # Prior-shift inference weights taken from the training distribution.
        # There are eight entries, so ``logits`` must have eight columns for
        # the multiplication below to broadcast.
        train_prior_weights = np.array(
            [
                1088/1072, 1088/16,
                17746/17515, 17746/231,
                6454/6273, 6454/181,
                850/834, 850/16,
            ]
        ) / 100
        reweighted = softmax(logits, axis=1) * train_prior_weights
        # The re-weighted values no longer sum to one; the final softmax
        # below is applied to them once more.
        scores = _sum_over_groups(reweighted)
    else:
        scores = logits
    return softmax(scores, axis=1)
# Fairlearn docs
def compute_error_metric(metric_value, sample_size):
    """Compute the 95% error margin of a proportion (normal approximation).

    The proportion is ``metric_value / sample_size``; the result is 1.96
    times its binomial standard error, i.e. the half-width of an
    approximate 95% confidence interval, not the bare standard error.

    Parameters:
        metric_value: Number of events counted (e.g. false positives).
            Despite the name this is a count, not a rate: it is divided by
            ``sample_size`` here.
        sample_size: Number of data points the count was taken from.
    Returns:
        ``1.96 * sqrt(p * (1 - p) / sample_size)`` with
        ``p = metric_value / sample_size``. When ``sample_size`` is zero the
        proportion is undefined and NaN is returned.
    """
    count = np.asarray(metric_value, dtype=float)
    total = np.asarray(sample_size, dtype=float)
    # An empty denominator (e.g. a group without negatives) must not crash or
    # spam warnings; let the division produce NaN quietly instead.
    with np.errstate(divide="ignore", invalid="ignore"):
        proportion = count / total
        return 1.96 * np.sqrt(proportion * (1.0 - proportion)) / np.sqrt(total)
def false_positive_error(y_true, y_pred):
    """Compute the 95% error margin for the false positive rate estimate.

    Parameters:
        y_true: Ground-truth binary labels, encoded as 0 (negative) and
            1 (positive).
        y_pred: Predicted binary labels in the same encoding.
    Returns:
        The value of ``compute_error_metric`` for the false-positive count
        over the number of actual negatives (``tn + fp``).
    """
    # Pin the label set: without it a slice that contains a single class
    # yields a 1x1 matrix and the four-way unpacking below raises ValueError.
    tn, fp, _fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return compute_error_metric(fp, tn + fp)
def false_negative_error(y_true, y_pred):
    """Compute the 95% error margin for the false negative rate estimate.

    Parameters:
        y_true: Ground-truth binary labels, encoded as 0 (negative) and
            1 (positive).
        y_pred: Predicted binary labels in the same encoding.
    Returns:
        The value of ``compute_error_metric`` for the false-negative count
        over the number of actual positives (``fn + tp``).
    """
    # Pin the label set: without it a slice that contains a single class
    # yields a 1x1 matrix and the four-way unpacking below raises ValueError.
    _tn, _fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return compute_error_metric(fn, fn + tp)
def balanced_accuracy_error(y_true, y_pred):
    """Compute the error estimate for balanced accuracy.

    The false-positive and false-negative error terms are combined in
    quadrature (square root of the sum of squares) and the result is halved.

    Parameters:
        y_true: Ground-truth labels, forwarded to the two error helpers.
        y_pred: Predicted labels, forwarded to the two error helpers.
    Returns:
        ``sqrt(fnr_error**2 + fpr_error**2) / 2``.
    """
    fpr_margin = false_positive_error(y_true, y_pred)
    fnr_margin = false_negative_error(y_true, y_pred)
    sum_of_squares = fnr_margin**2 + fpr_margin**2
    return np.sqrt(sum_of_squares) / 2
if __name__ == "__main__":
    # Collect overall and per-group metrics for every listed experiment.
    #
    # For each experiment the script loads the saved labels, logits and group
    # ids of its first checkpoint folder, stores the posterior probabilities
    # next to them, appends one summary row to ``common_csv`` (relative to
    # the current working directory) and writes the per-group metrics to
    # ``disagg_csv`` inside the checkpoint folder.
    root_dir = "C:\\Users\\Duje\\Desktop\\fer\\8. semestar\\lumen\\rezultati\\02 eksperimenti\\"
    common_csv = "rezultati.csv"
    disagg_csv = "disaggregated.csv"
    experiments = [
        "01 baseline 0304",
        "02 recall ce 0304",
        "04 cielab re based",
        "05 cielab ohem",
        "08 optim params large",
        "10 transformer\\normal",
        "11 transformer ohem",
        "12 domain discriminative\\new",
        "13 oversampler",
        "15 focal loss\\new",
        "14 domain independent\\new",
        "16 efficient m\\new",
        "17 masked\\new",
        "18 efficient l\\new",
        "19 oversampler trio\\1 base",
        "19 oversampler trio\\2 ifw, recall_ce",
        "19 oversampler trio\\3 ifw, ohem",
        "20 dino\\new",
        "21 dino oversample",
        "22 dino undersample",
        # The comma after the next entry was missing, which silently glued
        # the last two names into one non-existent experiment folder.
        "23 long train 04",
        "24 dd transformer",
    ]

    # Metric functions evaluated overall and per sensitive group. The set is
    # the same for every experiment, so it is built once.
    metrics = dict(
        count=count,
        f1=f1_score,
        recall=recall_score,
        accuracy=accuracy_score,
        selection_rate=selection_rate,
        balanced_accuracy=balanced_accuracy_score,
        balanced_acc_error=balanced_accuracy_error,
        false_positive_rate=false_positive_rate,
        false_positive_error=false_positive_error,
        false_negative_rate=false_negative_rate,
        false_negative_error=false_negative_error,
    )
    # Metrics whose between-group difference is reported, mapped to the
    # column name used for that difference in the summary CSV.
    diff_names = dict(
        f1="f1_diff",
        recall="recall_diff",
        accuracy="accuracy_diff",
        balanced_accuracy="balanced_acc_diff",
        false_positive_rate="fpr_diff",
        false_negative_rate="fnr_diff",
    )

    logging.info(f"Collecting metrics for {len(experiments)} experiments")
    for exp in experiments:
        eval_dir = os.path.join(root_dir, exp, "eval")
        # First sub-directory reported by os.walk is taken as the checkpoint.
        chkpt = next(os.walk(eval_dir))[1][0]  # checkpoint folder
        logging.info(f"Evaluating checkpoint {chkpt} for experiment {exp}")

        with open(os.path.join(root_dir, exp, "config.json")) as f:
            config = json.load(f)
        y_true = np.load(os.path.join(eval_dir, chkpt, "y_true.npy"))
        logits = np.load(os.path.join(eval_dir, chkpt, "logits.npy"))
        groups = np.load(os.path.join(eval_dir, chkpt, "groups.npy"))

        y_prob = logits_to_probs(logits, config)
        y_pred = np.argmax(y_prob, axis=1)
        prob_path = os.path.join(eval_dir, chkpt, "probs.npy")
        np.save(prob_path, y_prob)
        logging.info(f"Saved posteriror probabilities to {prob_path}")

        mf = MetricFrame(
            metrics=metrics,
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=groups,
        )
        dpd = demographic_parity_difference(
            y_true, y_pred, sensitive_features=groups
        ).item()
        eq_odds = equalized_odds_difference(y_true, y_pred, sensitive_features=groups)
        eq_opp = equal_opportunity_difference(
            y_true, y_pred, sensitive_features=groups
        ).item()
        diffs = mf.difference()[list(diff_names)].rename(diff_names)

        # One row for each experiment. The three series carry non-overlapping
        # labels, so ``add(..., fill_value=0)`` merges them without actually
        # summing any two values.
        fair = pd.Series([dpd, eq_odds, eq_opp], index=["dpd", "eq_odds", "eq_opp"])
        fair = fair.add(diffs, fill_value=0)
        result = mf.overall.add(fair, fill_value=0)

        header = ["experiment"] + result.keys().to_list()
        row = [exp] + result.to_list()
        # The header is written only when the summary file does not exist yet.
        needs_header = not os.path.isfile(common_csv)
        with open(common_csv, "a", newline="") as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(header)
            writer.writerow(row)
        logging.info(f"Added row to {common_csv}")

        # Disaggregated metrics. "sensitive_feature_0" is the name of the
        # index of ``by_group`` rather than a column, so renaming columns had
        # no effect; rename the index axis instead.
        group = mf.by_group.rename_axis(index="group")
        group.to_csv(os.path.join(eval_dir, chkpt, disagg_csv))
        logging.info(f"Saved disaggregated metrics to {disagg_csv}")
    logging.info("Done")