""" Dataset classes for HAM10000 and PAD-UFES-20. All paths are parameterized — no hardcoded Colab paths. """ import pandas as pd from pathlib import Path from PIL import Image from torch.utils.data import Dataset from sklearn.model_selection import train_test_split CLASS_NAMES = ["akiec", "bcc", "bkl", "df", "mel", "nv", "vasc"] NUM_CLASSES = len(CLASS_NAMES) # Maps PAD-UFES-20 diagnostic labels to HAM10000 class names PADUFES_TO_HAM = {"ACK": "akiec", "BCC": "bcc", "MEL": "mel", "NEV": "nv", "SEK": "bkl"} class HAM10000Dataset(Dataset): """HAM10000 dermoscopic image dataset. Expects directory layout:: data_dir/ train/ akiec/*.jpg bcc/*.jpg ... val/ akiec/*.jpg ... """ def __init__(self, data_dir, split="train", transform=None): self.transform = transform self.class_to_idx = {name: idx for idx, name in enumerate(CLASS_NAMES)} self.samples = [] split_dir = Path(data_dir) / split for class_name in CLASS_NAMES: class_dir = split_dir / class_name if class_dir.exists(): for img_path in class_dir.glob("*.jpg"): self.samples.append( {"path": img_path, "label": self.class_to_idx[class_name], "class_name": class_name} ) def __len__(self): return len(self.samples) def __getitem__(self, idx): sample = self.samples[idx] image = Image.open(sample["path"]).convert("RGB") if self.transform: image = self.transform(image) return image, sample["label"] class PADUFESDataset(Dataset): """PAD-UFES-20 clinical smartphone image dataset. Includes Fitzpatrick skin type metadata for fairness evaluation. Only keeps classes that map to HAM10000 via ``PADUFES_TO_HAM``. """ def __init__(self, data_dir, split="train", transform=None, val_ratio=0.2): self.transform = transform self.class_to_idx = {name: idx for idx, name in enumerate(CLASS_NAMES)} self.data_dir = Path(data_dir) # Find metadata CSV metadata_path = None for p in self.data_dir.rglob("*.csv"): metadata_path = p break if metadata_path is None: raise FileNotFoundError(f"No CSV found in {data_dir}") df = pd.read_csv(metadata_path) diag_col = "diagnostic" if "diagnostic" in df.columns else "dx" df = df[df[diag_col].isin(PADUFES_TO_HAM.keys())].copy() df["ham_class"] = df[diag_col].map(PADUFES_TO_HAM) df["label"] = df["ham_class"].map(self.class_to_idx) train_df, val_df = train_test_split(df, test_size=val_ratio, stratify=df["ham_class"], random_state=42) self.df = train_df if split == "train" else val_df # Find image directories (those with >10 .png files) self.img_dirs = [] for d in self.data_dir.rglob("*"): if d.is_dir() and len(list(d.glob("*.png"))) > 10: self.img_dirs.append(d) img_id_col = "img_id" if "img_id" in df.columns else "image_id" fitz_col = "fitspatrick" if "fitspatrick" in df.columns else None self.samples = [] for _, row in self.df.iterrows(): self.samples.append( { "img_id": row[img_id_col], "label": row["label"], "class_name": row["ham_class"], "fitzpatrick": row[fitz_col] if fitz_col else None, } ) def __len__(self): return len(self.samples) def __getitem__(self, idx): sample = self.samples[idx] img_id = sample["img_id"] img_id_base = img_id.rsplit(".", 1)[0] if img_id.endswith((".png", ".jpg")) else img_id img_path = None for img_dir in self.img_dirs: for ext in [".png", ".PNG", ".jpg"]: candidate = img_dir / f"{img_id_base}{ext}" if candidate.exists(): img_path = candidate break if img_path is None: candidate = img_dir / img_id if candidate.exists(): img_path = candidate if img_path: break if img_path is None: raise FileNotFoundError(f"Image not found: {img_id}") image = Image.open(img_path).convert("RGB") if self.transform: image = self.transform(image) return image, sample["label"] def get_fitzpatrick_groups(self): """Return sample indices grouped by Fitzpatrick skin type.""" groups = {"I-II": [], "III-IV": [], "V-VI": [], "unknown": []} for idx, sample in enumerate(self.samples): fitz = sample["fitzpatrick"] if fitz in [1, 2]: groups["I-II"].append(idx) elif fitz in [3, 4]: groups["III-IV"].append(idx) elif fitz in [5, 6]: groups["V-VI"].append(idx) else: groups["unknown"].append(idx) return groups