# cxr-vlm-code / dev /eval_labelers.py
# convitom
# chore(data): WIP EDA notebooks + labeler comparison tooling
# 21fa652
"""
Compare CheXpert labeler vs NegBio labeler against manual ground truth labels
from MIMIC-CXR-JPG test set.
"""
import pandas as pd
import numpy as np
from sklearn.metrics import (
accuracy_score, f1_score, precision_score, recall_score,
classification_report,
)
# ── Configuration — adjust these 4 values ────────────────────────────────────
CHEXPERT_PATH = r"D:\USTH\KLTN\cxr-vlm-data\mimic-cxr-2.0.0-chexpert.csv"
NEGBIO_PATH = r"D:\USTH\KLTN\cxr-vlm-data\mimic-cxr-2.0.0-negbio.csv"
GT_PATH = r"D:\USTH\KLTN\cxr-vlm-data\mimic-cxr-2.1.0-test-set-labeled.csv"
# How to handle uncertain labels (-1):
# "positive" → treat as disease present (conservative)
# "negative" → treat as disease absent
# "drop" → discard every study that has an uncertain label
# NOTE(review): the original comment described "positive" as the default, but
# the value actually set below is "negative" — confirm which one is intended.
UNCERTAIN_STRATEGY = "negative"
# ─────────────────────────────────────────────────────────────────────────────
# The 14 label columns this script looks for in each CSV; only the ones
# present in all three files are actually evaluated.
PATHOLOGIES = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Enlarged Cardiomediastinum",
    "Fracture",
    "Lung Lesion",
    "Lung Opacity",
    "No Finding",
    "Pleural Effusion",
    "Pleural Other",
    "Pneumonia",
    "Pneumothorax",
    "Support Devices",
]
def load_labels(path: str) -> pd.DataFrame:
    """Read a label CSV into a DataFrame.

    Args:
        path: Location of the CSV file. A name ending in ``.gz`` is read as
            gzip explicitly; for anything else pandas is left to infer the
            compression.

    Returns:
        The parsed table, exactly as ``pandas.read_csv`` returns it.
    """
    codec = "gzip" if path.endswith(".gz") else "infer"
    return pd.read_csv(path, compression=codec)
def resolve_uncertain(df: pd.DataFrame, cols: list[str], strategy: str) -> pd.DataFrame:
    """Turn labeler output (1 / 0 / -1 / NaN) into strict 0/1 integers.

    Args:
        df: Table holding the label columns. It is never modified; a new
            DataFrame is returned.
        cols: Names of the label columns to binarise. Other columns are
            passed through unchanged.
        strategy: How to treat uncertain labels (-1):
            ``"positive"`` maps them to 1, ``"negative"`` maps them to 0,
            ``"drop"`` removes every row with at least one -1 in ``cols``.

    Returns:
        A copy of ``df`` (with fewer rows for ``"drop"``) in which ``cols``
        contain only the integers 0 and 1. NaN (not mentioned) becomes 0.

    Raises:
        ValueError: If ``strategy`` is not one of the three known values.
    """
    # Validate first so a typo fails before any copying or filtering.
    if strategy not in ("positive", "negative", "drop"):
        raise ValueError(f"Unknown strategy: {strategy}")

    if strategy == "drop":
        has_uncertain = (df[cols] == -1.0).any(axis=1)
        # Explicit copy of the filtered rows: assigning into a bare boolean
        # slice can trigger pandas' SettingWithCopyWarning.
        out = df.loc[~has_uncertain].copy()
    else:
        out = df.copy()
        replacement = 1.0 if strategy == "positive" else 0.0
        out[cols] = out[cols].replace(-1.0, replacement)

    # NaN (not mentioned) → 0, then force everything into {0, 1}.
    out[cols] = out[cols].fillna(0.0).clip(0, 1).astype(int)
    return out
def available_pathologies(df: pd.DataFrame) -> list[str]:
    """List the PATHOLOGIES entries that exist as columns of ``df``.

    Args:
        df: Table whose column names are checked.

    Returns:
        The matching names, in the order they appear in ``PATHOLOGIES``.
    """
    columns = df.columns
    present = []
    for name in PATHOLOGIES:
        if name in columns:
            present.append(name)
    return present
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray, label: str) -> dict:
    """Collect the headline scores for one labeler.

    Args:
        y_true: Reference labels.
        y_pred: Labels produced by the tool under evaluation.
        label: Display name of the tool; stored under the ``"tool"`` key.

    Returns:
        A dict with the keys ``tool``, ``macro_f1``, ``micro_f1``,
        ``accuracy``, ``macro_prec`` and ``macro_rec`` (in that order).
        Undefined precision/recall/F1 values are reported as 0.

    NOTE(review): when the inputs are 2-D indicator matrices, scikit-learn's
    ``accuracy_score`` is expected to report exact-match (subset) accuracy
    rather than per-label accuracy — confirm this is the intended number.
    """

    def _macro(scorer):
        # Shared call shape for the three macro-averaged scores.
        return scorer(y_true, y_pred, average="macro", zero_division=0)

    report = {"tool": label}
    report["macro_f1"] = _macro(f1_score)
    report["micro_f1"] = f1_score(y_true, y_pred, average="micro", zero_division=0)
    report["accuracy"] = accuracy_score(y_true, y_pred)
    report["macro_prec"] = _macro(precision_score)
    report["macro_rec"] = _macro(recall_score)
    return report
def per_pathology_f1(y_true: np.ndarray, y_pred: np.ndarray, cols: list[str]) -> pd.Series:
    """Compute one F1 score per label column.

    Args:
        y_true: 2-D array of reference labels; column ``i`` belongs to
            ``cols[i]``.
        y_pred: 2-D array of predicted labels with the same column layout.
        cols: Label names, used as the index of the result.

    Returns:
        A Series mapping each name in ``cols`` to its F1 score (0 when the
        score is undefined).
    """
    return pd.Series(
        {
            name: f1_score(y_true[:, col_idx], y_pred[:, col_idx], zero_division=0)
            for col_idx, name in enumerate(cols)
        }
    )
def main():
    """Compare the CheXpert and NegBio labelers against the ground truth.

    Loads the three CSVs named by ``GT_PATH``, ``CHEXPERT_PATH`` and
    ``NEGBIO_PATH``, keeps the studies (``study_id``) and pathology columns
    present in all of them, binarises the labels, prints overall metrics,
    per-pathology F1 and full classification reports, and writes the
    per-pathology table to ``dev/labeler_comparison.csv``.

    Equal F1 scores are reported as ``"Tie"`` instead of being credited to
    one of the tools.

    Raises:
        ValueError: If no study or no pathology column is left to evaluate,
            or if ``UNCERTAIN_STRATEGY`` is not a known strategy.
    """
    # Local import: only needed to create the output directory.
    from pathlib import Path

    print("Loading files...")
    gt = load_labels(GT_PATH)
    chx = load_labels(CHEXPERT_PATH)
    neg = load_labels(NEGBIO_PATH)
    print(f" Ground truth : {len(gt):,} studies")
    print(f" CheXpert : {len(chx):,} studies")
    print(f" NegBio : {len(neg):,} studies")

    # Align on study_id
    gt = gt.set_index("study_id")
    chx = chx.set_index("study_id")
    neg = neg.set_index("study_id")
    common_idx = gt.index.intersection(chx.index).intersection(neg.index)
    print(f"\n Studies in all three : {len(common_idx):,}")
    # Selecting with the same index puts all three tables in the same row order.
    gt = gt.loc[common_idx]
    chx = chx.loc[common_idx]
    neg = neg.loc[common_idx]

    # Determine shared pathology columns
    cols = [p for p in PATHOLOGIES if p in gt.columns and p in chx.columns and p in neg.columns]
    print(f" Pathologies evaluated: {len(cols)}")
    print(f" {cols}\n")

    # Ground truth: NaN → 0, clip to binary
    gt_clean = gt[cols].fillna(0.0).clip(0, 1).astype(int)

    # Auto-labelers: resolve uncertain then binarise (resolve_uncertain
    # returns a new frame, so no defensive copy is needed here).
    chx_clean = resolve_uncertain(chx[cols], cols, UNCERTAIN_STRATEGY)
    neg_clean = resolve_uncertain(neg[cols], cols, UNCERTAIN_STRATEGY)

    # If strategy==drop, each labeler may have lost different rows, so all
    # three tables are restricted to the studies that survived in both.
    if UNCERTAIN_STRATEGY == "drop":
        shared = chx_clean.index.intersection(neg_clean.index)
        gt_clean = gt_clean.loc[shared]
        chx_clean = chx_clean.loc[shared]
        neg_clean = neg_clean.loc[shared]
        print(f" After dropping uncertain rows: {len(shared):,} studies remain\n")

    Y_true = gt_clean.values
    Y_chx = chx_clean.values
    Y_neg = neg_clean.values

    # Fail with a clear message instead of handing empty arrays to sklearn.
    if Y_true.size == 0:
        raise ValueError(
            f"Nothing to evaluate: {Y_true.shape[0]} studies x "
            f"{Y_true.shape[1]} pathologies remain after alignment."
        )

    # ── Overall metrics ────────────────────────────────────────────────────────
    res_chx = compute_metrics(Y_true, Y_chx, "CheXpert")
    res_neg = compute_metrics(Y_true, Y_neg, "NegBio")
    summary = pd.DataFrame([res_chx, res_neg]).set_index("tool")
    print("=" * 60)
    print("OVERALL METRICS (uncertain strategy: '{}')".format(UNCERTAIN_STRATEGY))
    print("=" * 60)
    print(summary.to_string(float_format="{:.4f}".format))

    # An exact tie is reported as such rather than credited to either tool.
    if res_chx["macro_f1"] > res_neg["macro_f1"]:
        winner = "CheXpert"
    elif res_chx["macro_f1"] < res_neg["macro_f1"]:
        winner = "NegBio"
    else:
        winner = "Tie"
    diff = abs(res_chx["macro_f1"] - res_neg["macro_f1"])
    print(f"\n→ Better labeler (macro-F1): {winner} (Δ = {diff:.4f})")

    # ── Per-pathology F1 ──────────────────────────────────────────────────────
    f1_chx = per_pathology_f1(Y_true, Y_chx, cols)
    f1_neg = per_pathology_f1(Y_true, Y_neg, cols)
    # With zero_division=0 both tools can score exactly 0 on a pathology;
    # such draws must not count as a win for either side.
    chx_better = (f1_chx > f1_neg).to_numpy()
    neg_better = (f1_neg > f1_chx).to_numpy()
    per_path = pd.DataFrame({
        "CheXpert_F1": f1_chx,
        "NegBio_F1": f1_neg,
        "Winner": np.where(chx_better, "CheXpert", np.where(neg_better, "NegBio", "Tie")),
        "Δ": (f1_chx - f1_neg).round(4),
    })
    print("\n" + "=" * 60)
    print("PER-PATHOLOGY F1")
    print("=" * 60)
    print(per_path.to_string(float_format="{:.4f}".format))
    chx_wins = (per_path["Winner"] == "CheXpert").sum()
    neg_wins = (per_path["Winner"] == "NegBio").sum()
    ties = (per_path["Winner"] == "Tie").sum()
    print(f"\nPathology wins → CheXpert: {chx_wins} | NegBio: {neg_wins} | Tie: {ties}")

    # ── Detailed classification report ────────────────────────────────────────
    print("\n" + "=" * 60)
    print("CLASSIFICATION REPORT — CheXpert")
    print("=" * 60)
    print(classification_report(Y_true, Y_chx, target_names=cols, zero_division=0))
    print("=" * 60)
    print("CLASSIFICATION REPORT — NegBio")
    print("=" * 60)
    print(classification_report(Y_true, Y_neg, target_names=cols, zero_division=0))

    # ── Save results ──────────────────────────────────────────────────────────
    out_path = "dev/labeler_comparison.csv"
    # The path is relative to the current working directory; create the
    # folder so the run does not fail at the very last step.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    per_path.to_csv(out_path)
    print(f"Per-pathology results saved to {out_path}")
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()