# File size: 7,421 Bytes

"""
Compare CheXpert labeler vs NegBio labeler against manual ground truth labels
from MIMIC-CXR-JPG test set.
"""

import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report,
)

# ── Configuration — edit these 4 lines ───────────────────────────────────────
# Input CSVs: the two automatic labelers' outputs and the manually labeled
# test set that serves as ground truth.
CHEXPERT_PATH = r"D:\USTH\KLTN\cxr-vlm-data\mimic-cxr-2.0.0-chexpert.csv"
NEGBIO_PATH   = r"D:\USTH\KLTN\cxr-vlm-data\mimic-cxr-2.0.0-negbio.csv"
GT_PATH       = r"D:\USTH\KLTN\cxr-vlm-data\mimic-cxr-2.1.0-test-set-labeled.csv"

# How uncertain labels (-1) produced by the automatic labelers are handled:
#   "positive" → treat as disease present (conservative)
#   "negative" → treat as disease absent
#   "drop"     → discard every study that has an uncertain label
UNCERTAIN_STRATEGY = "negative"
# ─────────────────────────────────────────────────────────────────────────────

# Label columns looked for in each CSV; only the ones present in all three
# files are evaluated.
PATHOLOGIES = [
    "Atelectasis", "Cardiomegaly", "Consolidation", "Edema",
    "Enlarged Cardiomediastinum", "Fracture", "Lung Lesion", "Lung Opacity",
    "No Finding", "Pleural Effusion", "Pleural Other", "Pneumonia",
    "Pneumothorax", "Support Devices",
]


def load_labels(path: str) -> pd.DataFrame:
    """Read a label CSV, plain or gzip-compressed, into a DataFrame.

    Decompression is delegated to pandas: ``read_csv`` defaults to
    ``compression="infer"``, which selects gzip for a ``.gz`` suffix, so no
    manual suffix check is needed. Not calling ``str`` methods on *path* also
    means ``pathlib.Path`` objects are accepted, not only strings.

    Args:
        path: Location of the CSV file (optionally ending in ``.gz``).

    Returns:
        The table as parsed by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(path)


def resolve_uncertain(df: pd.DataFrame, cols: list[str], strategy: str) -> pd.DataFrame:
    """Turn three-state labeler output into binary 0/1 labels.

    Works on a copy, so the caller's frame is left untouched.

    Args:
        df: Table holding the label columns.
        cols: Names of the label columns to process.
        strategy: What to do with uncertain (-1) entries: ``"positive"``
            rewrites them to 1, ``"negative"`` rewrites them to 0, and
            ``"drop"`` removes every row with at least one -1 in *cols*.

    Returns:
        A new frame whose *cols* are integers in {0, 1}; missing values
        (label not mentioned) become 0.

    Raises:
        ValueError: If *strategy* is not one of the three names above.
    """
    uncertain = -1.0
    # Replacement value for each strategy that rewrites uncertain labels.
    substitutes = {"positive": 1.0, "negative": 0.0}

    out = df.copy()
    if strategy in substitutes:
        fill = substitutes[strategy]
        for name in cols:
            out[name] = out[name].replace(uncertain, fill)
    elif strategy == "drop":
        # Keep only the rows with no uncertain value in any label column.
        has_uncertain = out[cols].eq(uncertain).any(axis=1)
        out = out[~has_uncertain]
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    # NaN (label not mentioned) counts as negative; clip guards the 0..1 range.
    binary = out[cols].fillna(0.0).clip(0, 1).astype(int)
    out[cols] = binary
    return out


def available_pathologies(df: pd.DataFrame) -> list[str]:
    """Return the ``PATHOLOGIES`` entries that are columns of *df*.

    The result keeps the order of ``PATHOLOGIES``, not the column order of
    the frame.
    """
    present = df.columns
    return list(filter(lambda name: name in present, PATHOLOGIES))


def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray, label: str) -> dict:
    """Collect the overall scores of one labeler in a single dict.

    Args:
        y_true: Reference labels.
        y_pred: Labels produced by the tool, same shape as *y_true*.
        label: Name of the tool; stored under the ``"tool"`` key.

    Returns:
        Dict with keys ``tool``, ``macro_f1``, ``micro_f1``, ``accuracy``,
        ``macro_prec`` and ``macro_rec``, in that order. Undefined ratios are
        reported as 0 (``zero_division=0``).

    Note:
        For 2-D multilabel indicator input, scikit-learn documents
        ``accuracy_score`` as subset (exact-match) accuracy, so that value is
        not directly comparable to the per-label F1 figures.
    """
    def macro(metric):
        # Every macro-averaged score shares the same keyword arguments.
        return metric(y_true, y_pred, average="macro", zero_division=0)

    metrics = {"tool": label}
    metrics["macro_f1"] = macro(f1_score)
    metrics["micro_f1"] = f1_score(y_true, y_pred, average="micro", zero_division=0)
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["macro_prec"] = macro(precision_score)
    metrics["macro_rec"] = macro(recall_score)
    return metrics


def per_pathology_f1(y_true: np.ndarray, y_pred: np.ndarray, cols: list[str]) -> pd.Series:
    """Compute one binary F1 score per label column.

    Args:
        y_true: 2-D reference labels; column ``i`` belongs to ``cols[i]``.
        y_pred: 2-D predicted labels laid out like *y_true*.
        cols: Label names, in column order.

    Returns:
        Series indexed by label name holding that label's F1 score
        (0 where the score is undefined, via ``zero_division=0``).
    """
    return pd.Series({
        name: f1_score(y_true[:, idx], y_pred[:, idx], zero_division=0)
        for idx, name in enumerate(cols)
    })


def main() -> None:
    """Compare the CheXpert and NegBio labelers against the manual labels.

    Steps: load the three CSVs, align them on ``study_id``, binarise the
    labels, print overall and per-pathology scores plus the scikit-learn
    classification reports, and write the per-pathology table to
    ``dev/labeler_comparison.csv`` (the folder is created when missing).

    Equal scores are reported as ``"Tie"`` both for the overall winner and in
    the per-pathology table, so neither tool is credited for a draw.

    Raises:
        SystemExit: If the three files share no study, share none of the
            expected label columns, or no study is left after dropping
            uncertain rows.
    """
    # Local import: only needed here to prepare the output folder.
    from pathlib import Path

    print("Loading files...")
    gt = load_labels(GT_PATH)
    chx = load_labels(CHEXPERT_PATH)
    neg = load_labels(NEGBIO_PATH)

    print(f"  Ground truth : {len(gt):,} studies")
    print(f"  CheXpert     : {len(chx):,} studies")
    print(f"  NegBio       : {len(neg):,} studies")

    # Align on study_id
    gt = gt.set_index("study_id")
    chx = chx.set_index("study_id")
    neg = neg.set_index("study_id")

    common_idx = gt.index.intersection(chx.index).intersection(neg.index)
    print(f"\n  Studies in all three : {len(common_idx):,}")
    if common_idx.empty:
        # Fail early with a readable message instead of inside the metrics.
        raise SystemExit("No study_id is shared by the three files; nothing to evaluate.")

    # Re-index all three frames with the same labels so rows line up.
    gt = gt.loc[common_idx]
    chx = chx.loc[common_idx]
    neg = neg.loc[common_idx]

    # Determine shared pathology columns
    cols = [p for p in PATHOLOGIES if p in gt.columns and p in chx.columns and p in neg.columns]
    print(f"  Pathologies evaluated: {len(cols)}")
    print(f"    {cols}\n")
    if not cols:
        raise SystemExit("None of the expected pathology columns is present in all three files.")

    # Ground truth: NaN → 0, clip to binary.
    # NOTE(review): a -1 in the ground truth, if any exists, is clipped to 0
    # here regardless of UNCERTAIN_STRATEGY — confirm that is intended.
    gt_clean = gt[cols].fillna(0.0).clip(0, 1).astype(int)

    # Auto-labelers: resolve uncertain then binarise
    chx_clean = resolve_uncertain(chx[cols].copy(), cols, UNCERTAIN_STRATEGY)
    neg_clean = resolve_uncertain(neg[cols].copy(), cols, UNCERTAIN_STRATEGY)

    # If strategy==drop, each labeler may have lost different rows, so keep
    # only the studies that survived in both and filter the ground truth too.
    if UNCERTAIN_STRATEGY == "drop":
        shared = chx_clean.index.intersection(neg_clean.index)
        gt_clean = gt_clean.loc[shared]
        chx_clean = chx_clean.loc[shared]
        neg_clean = neg_clean.loc[shared]
        print(f"  After dropping uncertain rows: {len(shared):,} studies remain\n")
        if shared.empty:
            raise SystemExit("No study is left after dropping uncertain rows; nothing to evaluate.")

    Y_true = gt_clean.values
    Y_chx = chx_clean.values
    Y_neg = neg_clean.values

    # ── Overall metrics ────────────────────────────────────────────────────────
    res_chx = compute_metrics(Y_true, Y_chx, "CheXpert")
    res_neg = compute_metrics(Y_true, Y_neg, "NegBio")

    summary = pd.DataFrame([res_chx, res_neg]).set_index("tool")
    print("=" * 60)
    print("OVERALL METRICS (uncertain strategy: '{}')".format(UNCERTAIN_STRATEGY))
    print("=" * 60)
    print(summary.to_string(float_format="{:.4f}".format))

    # Strict comparisons in both directions so an exact draw is named as such.
    if res_chx["macro_f1"] > res_neg["macro_f1"]:
        winner = "CheXpert"
    elif res_neg["macro_f1"] > res_chx["macro_f1"]:
        winner = "NegBio"
    else:
        winner = "Tie"
    diff = abs(res_chx["macro_f1"] - res_neg["macro_f1"])
    print(f"\n→ Better labeler (macro-F1): {winner}  (Δ = {diff:.4f})")

    # ── Per-pathology F1 ──────────────────────────────────────────────────────
    f1_chx = per_pathology_f1(Y_true, Y_chx, cols)
    f1_neg = per_pathology_f1(Y_true, Y_neg, cols)

    # Same tie rule as the overall winner: equal scores (e.g. 0.0 vs 0.0 on a
    # label neither tool detects) are not counted as a win for either tool.
    winners = np.where(
        f1_chx > f1_neg,
        "CheXpert",
        np.where(f1_neg > f1_chx, "NegBio", "Tie"),
    )
    per_path = pd.DataFrame({
        "CheXpert_F1": f1_chx,
        "NegBio_F1":   f1_neg,
        "Winner":      winners,
        "Δ":           (f1_chx - f1_neg).round(4),
    })

    print("\n" + "=" * 60)
    print("PER-PATHOLOGY F1")
    print("=" * 60)
    print(per_path.to_string(float_format="{:.4f}".format))

    chx_wins = (per_path["Winner"] == "CheXpert").sum()
    neg_wins = (per_path["Winner"] == "NegBio").sum()
    ties = (per_path["Winner"] == "Tie").sum()
    wins_line = f"\nPathology wins → CheXpert: {chx_wins}  |  NegBio: {neg_wins}"
    if ties:
        # Only mention ties when there are some, keeping the usual line intact.
        wins_line += f"  |  Tie: {ties}"
    print(wins_line)

    # ── Detailed classification report ────────────────────────────────────────
    print("\n" + "=" * 60)
    print("CLASSIFICATION REPORT — CheXpert")
    print("=" * 60)
    print(classification_report(Y_true, Y_chx, target_names=cols, zero_division=0))

    print("=" * 60)
    print("CLASSIFICATION REPORT — NegBio")
    print("=" * 60)
    print(classification_report(Y_true, Y_neg, target_names=cols, zero_division=0))

    # ── Save results ──────────────────────────────────────────────────────────
    out_path = "dev/labeler_comparison.csv"
    # Create the target folder first so the run does not fail at the last step.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    per_path.to_csv(out_path)
    print(f"Per-pathology results saved to {out_path}")


# Run the comparison only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()