""" Prepare LIDC-IDRI data for deterministic baselines. Creates flat directories with majority-vote merged masks. Also prepares nnU-Net format dataset. """ import os import sys import glob import argparse import numpy as np from PIL import Image from tqdm import tqdm import shutil def majority_vote_mask(mask_paths): """Create majority vote mask from multiple annotator masks (>=2/4 agree).""" masks = [] for p in mask_paths: m = np.array(Image.open(p).convert("L")) m = (m > 127).astype(np.uint8) # Binarize masks.append(m) # Stack and sum: pixel = 1 if >= 2 annotators agree stacked = np.stack(masks, axis=0) vote = (np.sum(stacked, axis=0) >= 2).astype(np.uint8) return vote * 255 # Save as 0/255 PNG def process_split(data_dir, output_dir, split_name): """Process a train or test split.""" images_dir = os.path.join(output_dir, "images") masks_dir = os.path.join(output_dir, "masks") os.makedirs(images_dir, exist_ok=True) os.makedirs(masks_dir, exist_ok=True) # Find all patient directories patient_dirs = sorted(glob.glob(os.path.join(data_dir, "LIDC-IDRI-*"))) count = 0 skipped = 0 for patient_dir in tqdm(patient_dirs, desc=f"Processing {split_name}"): patient_id = os.path.basename(patient_dir) nodule_dirs = sorted(glob.glob(os.path.join(patient_dir, "nodule-*"))) for nodule_dir in nodule_dirs: nodule_id = os.path.basename(nodule_dir) image_files = sorted(glob.glob(os.path.join(nodule_dir, "images", "slice-*.png"))) for img_path in image_files: slice_name = os.path.basename(img_path) # e.g., slice-0.png slice_id = slice_name.replace(".png", "") # e.g., slice-0 # Find all annotator masks for this slice mask_paths = [] for mask_dir in sorted(glob.glob(os.path.join(nodule_dir, "mask-*"))): mask_path = os.path.join(mask_dir, slice_name) if os.path.exists(mask_path): mask_paths.append(mask_path) if len(mask_paths) < 2: skipped += 1 continue # Create output filename: LIDC-IDRI-0001_nodule-0_slice-0 out_name = f"{patient_id}_{nodule_id}_{slice_id}.png" # Copy image shutil.copy2(img_path, os.path.join(images_dir, out_name)) # Create and save majority vote mask mv_mask = majority_vote_mask(mask_paths) Image.fromarray(mv_mask).save(os.path.join(masks_dir, out_name)) count += 1 print(f"{split_name}: Processed {count} slices, skipped {skipped}") return count def prepare_nnunet_format(flat_train_dir, flat_test_dir, nnunet_raw_dir): """Convert flat dataset to nnU-Net v2 format.""" dataset_dir = os.path.join(nnunet_raw_dir, "Dataset001_LIDC") imagesTr = os.path.join(dataset_dir, "imagesTr") labelsTr = os.path.join(dataset_dir, "labelsTr") imagesTs = os.path.join(dataset_dir, "imagesTs") labelsTs = os.path.join(dataset_dir, "labelsTs") for d in [imagesTr, labelsTr, imagesTs, labelsTs]: os.makedirs(d, exist_ok=True) # nnU-Net expects: case_XXXX_0000.png for images, case_XXXX.png for labels # Channel suffix _0000 for single-channel print("Converting to nnU-Net format...") # Training train_images = sorted(glob.glob(os.path.join(flat_train_dir, "images", "*.png"))) for i, img_path in enumerate(tqdm(train_images, desc="nnU-Net train")): basename = os.path.splitext(os.path.basename(img_path))[0] case_id = f"LIDC_{i:05d}" # Copy image with _0000 suffix shutil.copy2(img_path, os.path.join(imagesTr, f"{case_id}_0000.png")) # Copy mask (convert 0/255 to 0/1 for nnU-Net) mask_path = os.path.join(flat_train_dir, "masks", os.path.basename(img_path)) mask = np.array(Image.open(mask_path).convert("L")) mask = (mask > 127).astype(np.uint8) Image.fromarray(mask).save(os.path.join(labelsTr, f"{case_id}.png")) # Testing test_images = sorted(glob.glob(os.path.join(flat_test_dir, "images", "*.png"))) for i, img_path in enumerate(tqdm(test_images, desc="nnU-Net test")): basename = os.path.splitext(os.path.basename(img_path))[0] case_id = f"LIDC_{i:05d}" shutil.copy2(img_path, os.path.join(imagesTs, f"{case_id}_0000.png")) mask_path = os.path.join(flat_test_dir, "masks", os.path.basename(img_path)) mask = np.array(Image.open(mask_path).convert("L")) mask = (mask > 127).astype(np.uint8) Image.fromarray(mask).save(os.path.join(labelsTs, f"{case_id}.png")) # Create dataset.json import json dataset_json = { "channel_names": {"0": "CT"}, "labels": {"background": 0, "nodule": 1}, "numTraining": len(train_images), "file_ending": ".png", "name": "Dataset001_LIDC", "description": "LIDC-IDRI Lung Nodule Segmentation (majority vote GT)", "reference": "LIDC-IDRI", "licence": "CC BY 3.0", "release": "1.0" } with open(os.path.join(dataset_dir, "dataset.json"), "w") as f: json.dump(dataset_json, f, indent=2) # Save mapping from nnU-Net case IDs to original names (for prediction conversion) mapping = {} for i, img_path in enumerate(sorted(glob.glob(os.path.join(flat_test_dir, "images", "*.png")))): case_id = f"LIDC_{i:05d}" original_name = os.path.splitext(os.path.basename(img_path))[0] mapping[case_id] = original_name with open(os.path.join(dataset_dir, "test_case_mapping.json"), "w") as f: json.dump(mapping, f, indent=2) print(f"nnU-Net dataset created at {dataset_dir}") print(f" Training: {len(train_images)} cases") print(f" Testing: {len(test_images)} cases") def main(): parser = argparse.ArgumentParser() parser.add_argument("--data_root", type=str, default="data", help="Root data directory") parser.add_argument("--skip_nnunet", action="store_true", help="Skip nnU-Net format conversion") args = parser.parse_args() train_dir = os.path.join(args.data_root, "training") test_dir = os.path.join(args.data_root, "testing") flat_train = os.path.join(args.data_root, "flat_train") flat_test = os.path.join(args.data_root, "flat_test") print("=" * 60) print("Preparing flat dataset with majority-vote masks") print("=" * 60) n_train = process_split(train_dir, flat_train, "Training") n_test = process_split(test_dir, flat_test, "Testing") print(f"\nTotal: {n_train} train, {n_test} test slices") if not args.skip_nnunet: print("\n" + "=" * 60) print("Preparing nnU-Net format dataset") print("=" * 60) nnunet_raw = os.path.join(args.data_root, "nnUNet_raw") prepare_nnunet_format(flat_train, flat_test, nnunet_raw) print("\nDone!") if __name__ == "__main__": main()