""" prepare_data.py — organise raw CBIS-DDSM images into train/val folder structure. If your downloaded images are already in data/train/benign etc., skip this. Usage ----- python prepare_data.py --images /path/to/raw/images --csv /path/to/labels.csv CSV must have columns: file_path, pathology pathology values: BENIGN, MALIGNANT (or benign, malignant) Output ------ data/ train/benign/ train/malignant/ val/benign/ val/malignant/ """ import argparse import os import shutil import random TRAIN_RATIO = 0.85 def prepare(images_dir: str, csv_path: str, output_dir: str, seed: int = 42) -> None: import csv random.seed(seed) records: list[tuple[str, str]] = [] with open(csv_path, newline="") as f: reader = csv.DictReader(f) for row in reader: # normalise label label = row.get("pathology", row.get("label", "")).strip().lower() if label in ("benign", "benign_without_callback"): label = "benign" elif label in ("malignant",): label = "malignant" else: continue # skip unknown labels img_path = os.path.join(images_dir, row.get("file_path", "").strip()) if os.path.isfile(img_path): records.append((img_path, label)) print(f"Found {len(records)} labelled images") random.shuffle(records) split = int(len(records) * TRAIN_RATIO) splits = {"train": records[:split], "val": records[split:]} for split_name, items in splits.items(): for label in ("benign", "malignant"): os.makedirs(os.path.join(output_dir, split_name, label), exist_ok=True) for src, label in items: fname = os.path.basename(src) dst = os.path.join(output_dir, split_name, label, fname) shutil.copy2(src, dst) counts = {lbl: sum(1 for _, l in items if l == lbl) for lbl in ("benign", "malignant")} print(f"{split_name}: {counts}") print(f"Data prepared in {output_dir}/") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--images", required=True, help="Directory containing raw image files") parser.add_argument("--csv", required=True, help="CSV file with file_path and pathology columns") parser.add_argument("--output", default="data", help="Output directory") args = parser.parse_args() prepare(args.images, args.csv, args.output)