# CFPVesselSeg/datasets/FGADR.py
# Source: farrell236's repository, commit e99a83c ("add src")
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import KFold
class FGADRDataset(Dataset):
    """FGADR Seg-set dataset for diabetic retinopathy lesion segmentation.

    Expected structure:
        Seg-set/
        ├── DR_Seg_Grading_Label.csv
        ├── Original_Images/
        ├── Microaneurysms_Masks/
        ├── Hemohedge_Masks/
        ├── HardExudate_Masks/
        ├── SoftExudate_Masks/
        ├── IRMA_Masks/
        └── Neovascularization_Masks/

    CSV format, no header:
        filename,dr_grade

    Output (one dict per item, see ``__getitem__``):
        image: [3, H, W] float tensor
        label: [6, H, W] binary float tensor, one channel per lesion class
        grade: scalar long tensor
        case_id: filename stem

    split:
        "train" = all folds except selected fold
        "val" = selected fold
        "all" = full dataset

    Notes:
        If a lesion-specific mask file is absent, it is treated as an empty
        all-zero mask, meaning no incidence of that lesion class.
    """

    # Maps lesion class name -> mask sub-directory under root.
    # Dict order fixes the channel order of the returned label tensor.
    lesion_dirs = {
        "microaneurysm": "Microaneurysms_Masks",
        "hemorrhage": "Hemohedge_Masks",
        "hard_exudate": "HardExudate_Masks",
        "soft_exudate": "SoftExudate_Masks",
        "irma": "IRMA_Masks",
        "neovascularization": "Neovascularization_Masks",
    }

    def __init__(
        self,
        root,
        split="train",
        fold=0,
        n_folds=5,
        seed=42,
        transform=None,
        csv_name="DR_Seg_Grading_Label.csv",
        image_dir_name="Original_Images",
        mask_suffix="",
    ):
        """Index the dataset and build the requested K-fold split.

        Args:
            root: Path to the ``Seg-set`` directory.
            split: "train", "val", or "all".
            fold: 0-based index of the fold held out for validation.
            n_folds: Total number of K-fold splits.
            seed: Random state for shuffling samples before splitting, so the
                same (seed, n_folds) always yields the same partition.
            transform: Optional albumentations-style callable accepting
                ``image=...`` and ``masks=[...]`` keyword arguments.
            csv_name: Name of the grading CSV inside ``root``.
            image_dir_name: Name of the image directory inside ``root``.
            mask_suffix: Optional suffix inserted between the mask filename
                stem and its extension.

        Raises:
            ValueError: On invalid ``split`` or ``fold``.
            FileNotFoundError: If a required file or directory is missing.
            RuntimeError: If the CSV yields no samples.
        """
        self.root = Path(root)
        self.split = split
        self.fold = fold
        self.n_folds = n_folds
        self.seed = seed
        self.transform = transform
        self.csv_path = self.root / csv_name
        self.image_dir = self.root / image_dir_name
        self.mask_suffix = mask_suffix

        if split not in ["train", "val", "all"]:
            raise ValueError("split must be one of: 'train', 'val', 'all'")
        if not (0 <= fold < n_folds):
            raise ValueError(f"fold must be in [0, {n_folds - 1}], got {fold}")
        if not self.image_dir.exists():
            raise FileNotFoundError(f"Image directory not found: {self.image_dir}")
        if not self.csv_path.exists():
            raise FileNotFoundError(f"CSV file not found: {self.csv_path}")

        self.class_names = list(self.lesion_dirs.keys())

        # Fail fast if any lesion's mask directory is missing entirely
        # (individual missing mask *files* are tolerated; see _load_mask).
        for dirname in self.lesion_dirs.values():
            mask_dir = self.root / dirname
            if not mask_dir.exists():
                raise FileNotFoundError(f"Mask directory not found: {mask_dir}")

        all_samples = self._read_csv()
        if len(all_samples) == 0:
            raise RuntimeError(f"No samples found in {self.csv_path}")

        if split == "all":
            self.samples = all_samples
        else:
            # Deterministic shuffled K-fold over the full sample list.
            kfold = KFold(
                n_splits=n_folds,
                shuffle=True,
                random_state=seed,
            )
            splits = list(kfold.split(all_samples))
            train_indices, val_indices = splits[fold]
            if split == "train":
                self.samples = [all_samples[i] for i in train_indices]
            else:
                self.samples = [all_samples[i] for i in val_indices]

    def _read_csv(self):
        """Parse the grading CSV into a list of per-sample dicts.

        Blank lines and lines with fewer than two comma-separated fields are
        skipped. Each kept row is verified to have its image on disk.

        Returns:
            List of dicts with keys: filename, case_id, image_path, grade.

        Raises:
            FileNotFoundError: If a listed image file does not exist.
        """
        samples = []
        with open(self.csv_path, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split(",")
                if len(parts) < 2:
                    continue
                filename = parts[0].strip()
                grade = int(parts[1].strip())
                image_path = self.image_dir / filename
                if not image_path.exists():
                    raise FileNotFoundError(f"Image not found: {image_path}")
                samples.append(
                    {
                        "filename": filename,
                        "case_id": Path(filename).stem,
                        "image_path": image_path,
                        "grade": grade,
                    }
                )
        return samples

    def __len__(self) -> int:
        return len(self.samples)

    def _load_image(self, path):
        """Load an image as an HxWx3 uint8 RGB array.

        Uses a context manager so the underlying file handle is closed
        promptly instead of lingering until GC — avoids file-descriptor
        exhaustion with many DataLoader workers.
        """
        with Image.open(path) as image:
            return np.array(image.convert("RGB"))

    def _load_mask(self, path, shape):
        """Load a grayscale mask, or an all-zero uint8 mask of ``shape`` if
        the file is absent (absence means no incidence of that lesion)."""
        if path.exists():
            with Image.open(path) as mask_image:
                return np.array(mask_image.convert("L"))
        return np.zeros(shape, dtype=np.uint8)

    def _get_mask_path(self, lesion_name, filename):
        """Resolve the mask path for ``lesion_name``/``filename``, inserting
        ``mask_suffix`` before the extension when configured."""
        mask_dir = self.root / self.lesion_dirs[lesion_name]
        if self.mask_suffix:
            stem = Path(filename).stem
            suffix = Path(filename).suffix
            filename = f"{stem}{self.mask_suffix}{suffix}"
        return mask_dir / filename

    def __getitem__(self, idx):
        """Return one sample: image, stacked binary lesion masks, DR grade,
        and bookkeeping identifiers/paths."""
        sample_info = self.samples[idx]
        filename = sample_info["filename"]
        image_path = sample_info["image_path"]
        case_id = sample_info["case_id"]
        grade = sample_info["grade"]

        image = self._load_image(image_path)
        h, w = image.shape[:2]

        # One mask per lesion class, in class_names (channel) order.
        masks = []
        mask_paths = {}
        for lesion_name in self.class_names:
            mask_path = self._get_mask_path(lesion_name, filename)
            mask = self._load_mask(mask_path, shape=(h, w))
            masks.append(mask)
            mask_paths[lesion_name] = str(mask_path)

        if self.transform is not None:
            # Transform image and all masks jointly so spatial augmentations
            # stay aligned across lesion channels.
            transformed = self.transform(
                image=image,
                masks=masks,
            )
            image = transformed["image"]
            masks = transformed["masks"]
            # Transforms may return numpy arrays or tensors; normalize both.
            masks = [
                m.float() if isinstance(m, torch.Tensor) else torch.from_numpy(m).float()
                for m in masks
            ]
            label = torch.stack(masks, dim=0)
        else:
            image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
            label = torch.stack(
                [torch.from_numpy(m).float() for m in masks],
                dim=0,
            )

        # Binarize: any positive mask value counts as lesion presence.
        label = (label > 0).float()

        return {
            "image": image,
            "label": label,
            "grade": torch.tensor(grade, dtype=torch.long),
            "case_id": case_id,
            "filename": filename,
            "image_path": str(image_path),
            "mask_paths": mask_paths,
        }
if __name__ == "__main__":
    # Smoke test: verify files on disk, pull one batch, and visualize it.
    import matplotlib.pyplot as plt
    from tqdm import tqdm

    try:
        from augmentations import get_train_transforms, IMAGENET_MEAN, IMAGENET_STD
    except ImportError:
        # Running the file directly: make the project root importable first.
        import sys

        repo_root = Path(__file__).resolve().parents[1]
        sys.path.append(str(repo_root))
        from augmentations import get_train_transforms, IMAGENET_MEAN, IMAGENET_STD

    data_root = "/data/MIDS/datasets/retina/FGADR/Seg-set"
    img_size = 512
    ds = FGADRDataset(
        root=data_root,
        split="train",
        fold=0,
        n_folds=5,
        seed=42,
        transform=get_train_transforms(image_size=img_size),
    )

    # Pass 1: existence check over every image and lesion mask path.
    print("\nChecking all FGADR files...")
    n_missing_images = 0
    n_absent_masks = 0
    for rec in tqdm(ds.samples, desc="Checking files"):
        fname = rec["filename"]
        if not rec["image_path"].exists():
            print(f"Missing image: {rec['image_path']}")
            n_missing_images += 1
        n_absent_masks += sum(
            1
            for lesion in ds.class_names
            if not ds._get_mask_path(lesion, fname).exists()
        )
    print("File check complete.")
    print(f"Missing images: {n_missing_images}")
    print(f"Absent lesion masks treated as empty: {n_absent_masks}")

    # Pass 2: pull a single batch through a DataLoader and report shapes.
    loader = DataLoader(
        ds,
        batch_size=4,
        shuffle=True,
        num_workers=0,
    )
    batch = next(iter(loader))
    print("\nSmoke test batch:")
    print("Number of samples:", len(ds))
    print("Split:", ds.split)
    print("Fold:", ds.fold)
    print("Number of folds:", ds.n_folds)
    print("Class names:", ds.class_names)
    print("Batch keys:", batch.keys())
    print("Image shape:", batch["image"].shape)
    print("Label shape:", batch["label"].shape)
    print("Grade shape:", batch["grade"].shape)
    print("Label min/max:", batch["label"].min().item(), batch["label"].max().item())
    print("Case IDs:", batch["case_id"])

    # Visualize the first sample: undo ImageNet normalization for display.
    first_image = batch["image"][0].cpu()
    first_label = batch["label"][0].cpu()
    first_grade = batch["grade"][0].item()
    mean = torch.tensor(IMAGENET_MEAN).view(3, 1, 1)
    std = torch.tensor(IMAGENET_STD).view(3, 1, 1)
    rgb = (first_image * std + mean).clamp(0, 1).permute(1, 2, 0).numpy()
    any_lesion = (first_label.sum(dim=0) > 0).float().numpy()

    fig, axes = plt.subplots(2, 5, figsize=(20, 8))
    panels = axes.flatten()
    for panel in panels:
        panel.axis("off")

    panels[0].imshow(rgb)
    panels[0].set_title(f"Image | Grade {first_grade}")
    panels[1].imshow(any_lesion, cmap="gray")
    panels[1].set_title("Any Lesion")
    panels[2].imshow(rgb)
    panels[2].imshow(any_lesion, cmap="Reds", alpha=0.45)
    panels[2].set_title("Overlay")

    # One panel per lesion channel, in class order.
    for ch, lesion in enumerate(ds.class_names):
        panels[ch + 3].imshow(first_label[ch].numpy(), cmap="gray")
        panels[ch + 3].set_title(lesion)

    plt.tight_layout()
    plt.show()