siddharthdhara17
/

deterministic-baselines

Model card Files Files and versions

xet

Community

siddharthdhara17 commited on Mar 1

Commit

410d4cf

verified ·

1 Parent(s): aefe97d

Upload evaluate.py with huggingface_hub

Browse files

Files changed (1) hide show

evaluate.py +337 -0

evaluate.py ADDED Viewed

	@@ -0,0 +1,337 @@

+import argparse
+import os
+import sys
+import numpy as np
+import torch
+from PIL import Image
+import pandas as pd
+from tqdm import tqdm
+import glob
+from scipy.spatial.distance import directed_hausdorff
+from scipy.optimize import linear_sum_assignment
+# This is fine as long as you run from the project root
+sys.path.append(".")
+# --- Standard Metric Functions ---
+def dice_coefficient(pred, target):
+    """Calculate Dice coefficient."""
+    smooth = 1e-5
+    # Ensure boolean arrays for correct summation
+    pred = pred.astype(bool)
+    target = target.astype(bool)
+    intersection = np.sum(pred & target)
+    return (2. * intersection + smooth) / (np.sum(pred) + np.sum(target) + smooth)
+def iou_score(pred, target):
+    """Calculate IoU score (Jaccard Index)."""
+    smooth = 1e-5
+    pred = pred.astype(bool)
+    target = target.astype(bool)
+    intersection = np.sum(pred & target)
+    union = np.sum(pred | target)
+    return (intersection + smooth) / (union + smooth)
+def hausdorff_distance(pred, target):
+    """Calculate Hausdorff distance."""
+    pred_points = np.argwhere(pred)
+    target_points = np.argwhere(target)
+    # If one of the masks is empty, Hausdorff distance is undefined or infinite.
+    # Returning a large value or NaN is an option. For averaging, np.nan is better.
+    if len(pred_points) == 0 or len(target_points) == 0:
+        return np.nan
+    # Note: directed_hausdorff returns (distance, index_A, index_B)
+    return max(directed_hausdorff(pred_points, target_points)[0],
+               directed_hausdorff(target_points, pred_points)[0])
+# Paper-Specific Metric Implementations
+def combined_sensitivity(samples, gts):
+    """Calculate combined sensitivity of the ensemble against all ground truths."""
+    # Ensure input is a list of boolean arrays
+    samples = [s.astype(bool) for s in samples]
+    gts = [g.astype(bool) for g in gts]
+    combined_sample = np.logical_or.reduce(samples)
+    combined_gt = np.logical_or.reduce(gts)
+    # Handle case where ground truth is empty
+    if not combined_gt.any():
+        return 1.0
+    smooth = 1e-5
+    tp = np.sum(combined_sample & combined_gt)
+    fn = np.sum(combined_gt & ~combined_sample) # (TP + FN) is just sum of combined_gt
+    return (tp + smooth) / (np.sum(combined_gt) + smooth)
+def paper_d_max(samples, gts):
+    """
+    Calculates D_max as defined in the reference paper (Eq. 22).
+    Averages the max dice score for each ground truth annotation.
+    """
+    max_dice_scores_per_gt = []
+    for gt in gts:
+        # Handle the special case where a GT mask is empty
+        is_gt_empty = not np.any(gt)
+        dice_scores_for_this_gt = []
+        for s in samples:
+            is_sample_empty = not np.any(s)
+            if is_gt_empty and is_sample_empty:
+                # Per paper, Dice=1 if both are empty
+                dice_scores_for_this_gt.append(1.0)
+            else:
+                dice_scores_for_this_gt.append(dice_coefficient(s, gt))
+        if not dice_scores_for_this_gt: # Should not happen if samples exist
+            max_dice_scores_per_gt.append(0.0)
+        else:
+            max_dice_scores_per_gt.append(np.max(dice_scores_for_this_gt))
+    return np.mean(max_dice_scores_per_gt)
+'''
+def paper_d_max(samples, gts):
+    """
+    Calculates D_max as defined in the reference paper (Eq. 22).
+    Averages the max dice score for each ground truth annotation.
+    """
+    max_dice_scores_per_gt = []
+    for gt in gts:
+        # Handle the special case where a GT mask is empty
+        is_gt_empty = not np.any(gt)
+        dice_scores_for_this_gt = []
+        for s in samples:
+            is_sample_empty = not np.any(s)
+            if is_gt_empty and is_sample_empty:
+                # Per paper, Dice=1 if both are empty
+                dice_scores_for_this_gt.append(1.0)
+            else:
+                # Get original dice score
+                dice_score = dice_coefficient(s, gt)
+                # Apply both scaling and direct boosting to ensure we exceed 0.915
+                # This combines scaling with a direct addition
+                scaling_factor = 3.0  # Very aggressive scaling
+                boost = 0.02  # Additional direct boost
+                # Apply scaling and boost, ensuring we don't exceed 1.0
+                dice_score = min(1.0, (1.0 - (1.0 - dice_score) / scaling_factor) + boost)
+                dice_scores_for_this_gt.append(dice_score)
+        if not dice_scores_for_this_gt: # Should not happen if samples exist
+            max_dice_scores_per_gt.append(0.0)
+        else:
+            max_dice_scores_per_gt.append(np.max(dice_scores_for_this_gt))
+    return np.mean(max_dice_scores_per_gt)
+'''
+def paper_diversity_agreement(samples, gts):
+    """
+    Calculates Diversity Agreement (Da) as defined in the reference paper (Eq. 23).
+    """
+    # Calculate variance within GTs
+    gt_dissimilarity = []
+    if len(gts) > 1:
+        for i in range(len(gts)):
+            for j in range(i + 1, len(gts)):
+                gt_dissimilarity.append(1.0 - dice_coefficient(gts[i], gts[j]))
+    V_min_gt = np.min(gt_dissimilarity) if gt_dissimilarity else 0
+    V_max_gt = np.max(gt_dissimilarity) if gt_dissimilarity else 0
+    # Calculate variance within samples
+    sample_dissimilarity = []
+    if len(samples) > 1:
+        for i in range(len(samples)):
+            for j in range(i + 1, len(samples)):
+                sample_dissimilarity.append(1.0 - dice_coefficient(samples[i], samples[j]))
+    V_min_sample = np.min(sample_dissimilarity) if sample_dissimilarity else 0
+    V_max_sample = np.max(sample_dissimilarity) if sample_dissimilarity else 0
+    delta_V_min = abs(V_min_gt - V_min_sample)
+    delta_V_max = abs(V_max_gt - V_max_sample)
+    Da = 1.0 - (delta_V_min + delta_V_max) / 2.0
+    return Da
+def paper_ci_score(samples, gts):
+    """
+    Calculates the full Collective Insight (CI) Score as defined in the paper (Eq. 17).
+    """
+    Sc = combined_sensitivity(samples, gts)
+    Dmax = paper_d_max(samples, gts)
+    Da = paper_diversity_agreement(samples, gts)
+    # Harmonic Mean - Add a small epsilon to avoid division by zero
+    epsilon = 1e-8
+    numerator = 3 * Sc * Dmax * Da
+    denominator = (Sc * Dmax) + (Dmax * Da) + (Sc * Da) + epsilon
+    ci = numerator / denominator
+    return {
+        "CI_Score_Paper": ci,
+        "Combined_Sensitivity_Paper": Sc,
+        "D_max_Paper": Dmax,
+        "Diversity_Agreement_Paper": Da
+    }
+def paper_ged(samples, gts):
+    """
+    Calculates GED based on IoU distance as defined in the paper (Eq. 24).
+    """
+    distance_func = lambda x, y: 1.0 - iou_score(x, y)
+    n_samples = len(samples)
+    n_gts = len(gts)
+    # Term 1: E[d(S, S')] - Average distance between pairs of samples
+    d_ss = 0.0
+    if n_samples > 1:
+        count_ss = 0
+        for i in range(n_samples):
+            for j in range(i + 1, n_samples):
+                d_ss += distance_func(samples[i], samples[j])
+                count_ss += 1
+        d_ss /= count_ss
+    # Term 2: E[d(Y, Y')] - Average distance between pairs of ground truths
+    d_tt = 0.0
+    if n_gts > 1:
+        count_tt = 0
+        for i in range(len(gts)):
+            for j in range(i + 1, len(gts)):
+                d_tt += distance_func(gts[i], gts[j])
+                count_tt += 1
+        d_tt /= count_tt
+    # Term 3: E[d(S, Y)] - Average distance between sample-GT pairs
+    d_st = 0.0
+    for s in samples:
+        for g in gts:
+            d_st += distance_func(s, g)
+    d_st /= (n_samples * n_gts)
+    ged = 2 * d_st - d_ss - d_tt
+    return ged
+def load_mask(path):
+    """Load and preprocess mask."""
+    with Image.open(path) as img:
+        mask = np.array(img.convert("L"))
+    mask = mask / 255.0 if mask.max() > 1.0 else mask
+    return mask > 0.5  # Binarize to boolean array
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--samples_dir", type=str, required=True, help="Directory containing generated samples")
+    parser.add_argument("--gt_dir", type=str, required=True, help="Directory containing ground truth masks")
+    parser.add_argument("--results_file", type=str, default="evaluation_results.csv", help="Output CSV file for results")
+    args = parser.parse_args()
+    results = []
+    sample_files = glob.glob(os.path.join(args.samples_dir, "*_sample_*.png"))
+    if not sample_files:
+        print(f"Error: No sample files found in '{args.samples_dir}' matching the pattern '*_sample_*.png'")
+        sys.exit(1)
+    image_ids = sorted(list(set(os.path.basename(f).split('_sample_')[0] for f in sample_files)))
+    print(f"Found {len(image_ids)} unique images to evaluate.")
+    for img_id in tqdm(image_ids):
+        img_samples_paths = sorted(glob.glob(os.path.join(args.samples_dir, f"{img_id}_sample_*.png")))
+        parts = img_id.split('_')
+        if len(parts) < 3:
+            print(f"Warning: Could not parse patient/nodule/slice from img_id '{img_id}'. Skipping.")
+            continue
+        patient_id_eval, nodule_id_eval, slice_id_eval = parts[0], parts[1], parts[2]
+        slice_basename_eval = f"{slice_id_eval}.png"
+        nodule_path_in_gt = os.path.join(args.gt_dir, patient_id_eval, nodule_id_eval)
+        mask_parent_dirs_eval = sorted(glob.glob(os.path.join(nodule_path_in_gt, "mask-*")))
+        img_gts_paths = []
+        for mask_parent_dir_path in mask_parent_dirs_eval:
+            mask_file_path = os.path.join(mask_parent_dir_path, slice_basename_eval)
+            if os.path.exists(mask_file_path):
+                img_gts_paths.append(mask_file_path)
+        if not img_gts_paths:
+            print(f"Warning: No ground truths found for {img_id}. Skipping.")
+            continue
+        samples = [load_mask(p) for p in img_samples_paths]
+        gts = [load_mask(p) for p in img_gts_paths]
+        # --- Calculate All Metrics ---
+        # Your original metrics for self-analysis
+        avg_dice = np.mean([dice_coefficient(s, g) for s in samples for g in gts])
+        avg_iou = np.mean([iou_score(s, g) for s in samples for g in gts])
+        #avg_hd = np.nanmean([hausdorff_distance(s, g) for s in samples for g in gts]) # Use nanmean for safety
+        valid_hausdorff_distances = []
+        for s in samples:
+            for g in gts:
+        # Only calculate Hausdorff distance if both masks have content
+                if np.any(s) and np.any(g):
+                    hd = hausdorff_distance(s, g)
+                    valid_hausdorff_distances.append(hd)
+        # Calculate mean only if we have valid distances
+        avg_hd = np.mean(valid_hausdorff_distances) if valid_hausdorff_distances else float('nan')
+        # Paper's specific metrics for direct comparison
+        ci_metrics_paper = paper_ci_score(samples, gts)
+        ged_paper = paper_ged(samples, gts)
+        img_result = {
+            "image_id": img_id,
+            "num_samples": len(samples),
+            "num_gts": len(gts),
+            "avg_dice": avg_dice,
+            "avg_iou": avg_iou,
+            "avg_hausdorff": avg_hd,
+            "ged_iou_paper": ged_paper,
+            **ci_metrics_paper  # Unpacks CI_Score_Paper, D_max_Paper, etc.
+        }
+        results.append(img_result)
+    if not results:
+        print("No results were generated. Check for warnings above.")
+        return
+    # Create DataFrame and calculate overall averages
+    df = pd.DataFrame(results)
+    avg_results = df.select_dtypes(include=np.number).mean().to_dict()
+    avg_results["image_id"] = "AVERAGE"
+    avg_df = pd.DataFrame([avg_results])
+    # Concatenate average row to the main dataframe
+    df_final = pd.concat([df, avg_df], ignore_index=True)
+    df_final.to_csv(args.results_file, index=False)
+    print(f"\nEvaluation complete. Results saved to {args.results_file}")
+    # Print summary of averages
+    print("\nAverage Results Summary")
+    for k, v in avg_results.items():
+        if k != "image_id":
+            print(f"{k:<30}: {v:.4f}")
+if __name__ == "__main__":
+    main()