farrell236 committed on
Commit
e99a83c
·
1 Parent(s): 57d0fed
.gitignore ADDED
@@ -0,0 +1 @@
+ .idea
app.py ADDED
@@ -0,0 +1,294 @@
+ import argparse
+ from pathlib import Path
+
+ import cv2
+ import gradio as gr
+ import numpy as np
+ import torch
+ from PIL import Image
+
+ from augmentations import IMAGENET_MEAN, IMAGENET_STD
+ from models import build_model
+
+
+ APP_STATE = {}
+
+
+ def load_model(args, device):
+     model = build_model(
+         model_name=args.model,
+         num_classes=1,
+         in_channels=3,
+         image_size=args.image_size,
+         backbone=args.backbone,
+         pretrained=False,
+         base_channels=args.base_channels,
+         dropout=args.dropout,
+     )
+
+     checkpoint = torch.load(args.checkpoint, map_location="cpu")
+
+     if "model_state_dict" in checkpoint:
+         state_dict = checkpoint["model_state_dict"]
+     else:
+         state_dict = checkpoint
+
+     model.load_state_dict(state_dict, strict=True)
+     model.to(device)
+     model.eval()
+
+     return model
+
+
+ def preprocess_image(image, image_size):
+     if isinstance(image, Image.Image):
+         image = np.array(image.convert("RGB"))
+     else:
+         image = np.array(image)
+
+     if image.ndim == 2:
+         image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+
+     if image.shape[-1] == 4:
+         image = image[..., :3]
+
+     original_rgb = image.copy()
+
+     resized = cv2.resize(
+         image,
+         (image_size, image_size),
+         interpolation=cv2.INTER_LINEAR,
+     )
+
+     resized = resized.astype(np.float32) / 255.0
+
+     mean = np.array(IMAGENET_MEAN, dtype=np.float32).reshape(1, 1, 3)
+     std = np.array(IMAGENET_STD, dtype=np.float32).reshape(1, 1, 3)
+
+     resized = (resized - mean) / std
+     tensor = torch.from_numpy(resized).permute(2, 0, 1).unsqueeze(0).float()
+
+     return tensor, original_rgb
+
+
+ def overlay_mask(image_rgb, mask, alpha=0.45):
+     image_rgb = image_rgb.astype(np.uint8)
+
+     red = np.zeros_like(image_rgb)
+     red[..., 0] = 255
+
+     mask_3ch = mask[..., None]
+
+     overlay = image_rgb * (1 - alpha * mask_3ch) + red * (alpha * mask_3ch)
+     overlay = np.clip(overlay, 0, 255).astype(np.uint8)
+
+     return overlay
+
+
+ def run_inference(image, threshold):
+     tensor, original_rgb = preprocess_image(
+         image=image,
+         image_size=APP_STATE["image_size"],
+     )
+
+     tensor = tensor.to(APP_STATE["device"])
+
+     with torch.no_grad():
+         logits = APP_STATE["model"](tensor)
+         probs = torch.sigmoid(logits)
+
+     prob_map = probs[0, 0].detach().cpu().numpy()
+
+     original_h, original_w = original_rgb.shape[:2]
+
+     prob_map = cv2.resize(
+         prob_map,
+         (original_w, original_h),
+         interpolation=cv2.INTER_LINEAR,
+     )
+
+     pred_mask = (prob_map >= threshold).astype(np.float32)
+
+     return original_rgb, prob_map, pred_mask
+
+
+ def predict(image, threshold, alpha):
+     if image is None:
+         return None, None, None
+
+     original_rgb, prob_map, pred_mask = run_inference(image, threshold)
+
+     overlay = overlay_mask(original_rgb, pred_mask, alpha=alpha)
+     prob_vis = (prob_map * 255).clip(0, 255).astype(np.uint8)
+     mask_vis = (pred_mask * 255).astype(np.uint8)
+
+     return overlay, prob_vis, mask_vis
+
+
+ def build_app():
+     css = """
+     #input_image {
+         height: 430px !important;
+     }
+
+     #input_image img {
+         object-fit: contain !important;
+         max-height: 430px !important;
+     }
+
+     #overlay_output {
+         height: 200px !important;
+     }
+
+     #overlay_output img {
+         object-fit: contain !important;
+         max-height: 200px !important;
+     }
+
+     #prob_output {
+         height: 200px !important;
+     }
+
+     #prob_output img {
+         object-fit: contain !important;
+         max-height: 200px !important;
+     }
+
+     #mask_output {
+         height: 430px !important;
+     }
+
+     #mask_output img {
+         object-fit: contain !important;
+         max-height: 430px !important;
+     }
+     """
+
+     with gr.Blocks(title="Retina Vessel Segmentation", css=css) as demo:
+         gr.Markdown("# Retina Vessel Segmentation")
+         gr.Markdown(
+             f"Model: `{APP_STATE['model_name']}` | "
+             f"Backbone: `{APP_STATE['backbone']}` | "
+             f"Image size: `{APP_STATE['image_size']}`"
+         )
+
+         with gr.Row(equal_height=False):
+             with gr.Column(scale=1):
+                 input_image = gr.Image(
+                     type="pil",
+                     label="Input CFP Image",
+                     elem_id="input_image",
+                     height=430,
+                 )
+
+                 threshold = gr.Slider(
+                     minimum=0.05,
+                     maximum=0.95,
+                     value=0.5,
+                     step=0.05,
+                     label="Prediction Threshold",
+                 )
+
+                 alpha = gr.Slider(
+                     minimum=0.1,
+                     maximum=0.9,
+                     value=0.45,
+                     step=0.05,
+                     label="Overlay Alpha",
+                 )
+
+                 run_button = gr.Button("Segment")
+
+             with gr.Column(scale=1.2):
+                 with gr.Row():
+                     overlay_output = gr.Image(
+                         type="numpy",
+                         label="Overlay",
+                         elem_id="overlay_output",
+                         height=200,
+                     )
+
+                     prob_output = gr.Image(
+                         type="numpy",
+                         label="Probability Map",
+                         elem_id="prob_output",
+                         height=200,
+                     )
+
+                 mask_output = gr.Image(
+                     type="numpy",
+                     label="Binary Mask",
+                     elem_id="mask_output",
+                     height=430,
+                 )
+
+         run_button.click(
+             fn=predict,
+             inputs=[input_image, threshold, alpha],
+             outputs=[overlay_output, prob_output, mask_output],
+         )
+
+         threshold.change(
+             fn=predict,
+             inputs=[input_image, threshold, alpha],
+             outputs=[overlay_output, prob_output, mask_output],
+         )
+
+         alpha.change(
+             fn=predict,
+             inputs=[input_image, threshold, alpha],
+             outputs=[overlay_output, prob_output, mask_output],
+         )
+
+     return demo
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Gradio app for retina vessel segmentation.")
+     parser.add_argument("--checkpoint", type=str, default="checkpoints/fives_resunet/best.pt")
+     parser.add_argument("--image-size", type=int, default=1024)
+     parser.add_argument("--model", type=str, default="resunet", choices=["resunet", "deeplabv3", "vit"])
+     parser.add_argument("--backbone", type=str, default="resnet50")
+     parser.add_argument("--base-channels", type=int, default=32)
+     parser.add_argument("--dropout", type=float, default=0.0)
+     parser.add_argument("--device", type=str, default="cuda")
+     parser.add_argument("--server-name", type=str, default="127.0.0.1")
+     parser.add_argument("--server-port", type=int, default=7860)
+     parser.add_argument("--share", action="store_true")
+
+     return parser.parse_args()
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+
+     device = args.device
+     if device == "cuda" and not torch.cuda.is_available():
+         device = "cpu"
+
+     checkpoint_path = Path(args.checkpoint)
+     if not checkpoint_path.exists():
+         raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
+
+     APP_STATE["device"] = torch.device(device)
+     APP_STATE["image_size"] = args.image_size
+     APP_STATE["model_name"] = args.model
+     APP_STATE["backbone"] = args.backbone
+
+     APP_STATE["model"] = load_model(
+         args=args,
+         device=APP_STATE["device"],
+     )
+
+     print(f"Loaded checkpoint: {checkpoint_path}")
+     print(f"Device: {APP_STATE['device']}")
+     print(f"Model: {APP_STATE['model_name']}")
+     print(f"Backbone: {APP_STATE['backbone']}")
+     print(f"Image size: {APP_STATE['image_size']}")
+
+     demo = build_app()
+     demo.launch(
+         # server_name=args.server_name,
+         # server_port=args.server_port,
+         # share=args.share,
+     )
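
As a usage sketch, the app above can be launched locally like so, using the checkpoint shipped in this commit and the argparse defaults (the main block falls back to CPU when CUDA is unavailable):

    python app.py --checkpoint checkpoints/fives_resunet/best.pt --image-size 1024 --device cpu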
augmentations.py ADDED
@@ -0,0 +1,138 @@
+ """
+ augmentations.py
+
+ Simple camera-style augmentations for color fundus photography (CFP)
+ segmentation and classification.
+
+ Expected input:
+     RGB NumPy image, shape (H, W, 3)
+
+ Dependencies:
+     pip install albumentations opencv-python
+ """
+
+ import albumentations as A
+ from albumentations.pytorch import ToTensorV2
+
+
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+ def get_train_transforms(
+     image_size=1024,
+     mean=IMAGENET_MEAN,
+     std=IMAGENET_STD,
+ ):
+     """
+     Training transforms.
+     """
+     return A.Compose([
+         A.Resize(image_size, image_size),
+
+         A.HorizontalFlip(p=0.5),
+
+         A.ShiftScaleRotate(
+             shift_limit=0.02,
+             scale_limit=0.05,
+             rotate_limit=7,
+             border_mode=0,
+             value=0,
+             p=0.3,
+         ),
+
+         A.RandomBrightnessContrast(
+             brightness_limit=0.15,
+             contrast_limit=0.15,
+             p=0.5,
+         ),
+
+         A.RandomGamma(
+             gamma_limit=(85, 115),
+             p=0.3,
+         ),
+
+         A.HueSaturationValue(
+             hue_shift_limit=3,
+             sat_shift_limit=10,
+             val_shift_limit=10,
+             p=0.25,
+         ),
+
+         A.OneOf([
+             A.GaussianBlur(blur_limit=(3, 5)),
+             A.Downscale(scale_min=0.80, scale_max=0.95),
+             A.ImageCompression(quality_lower=75, quality_upper=100),
+         ], p=0.2),
+
+         A.Normalize(mean=mean, std=std),
+         ToTensorV2(),
+     ])
+
+
+ def get_val_transforms(
+     image_size=1024,
+     mean=IMAGENET_MEAN,
+     std=IMAGENET_STD,
+ ):
+     """
+     Validation/test transforms.
+     """
+     return A.Compose([
+         A.Resize(image_size, image_size),
+         A.Normalize(mean=mean, std=std),
+         ToTensorV2(),
+     ])
+
+
+ # -------------------------------------------------------------------------
+ # Suggested CFP augmentation parameter sets
+ # -------------------------------------------------------------------------
+ #
+ # 1) DEFAULT / CONSERVATIVE
+ #    Use this as a general starting point for CFP classification tasks.
+ #
+ #    Rationale:
+ #    - Simulates common camera/acquisition variability.
+ #    - Keeps color and image-quality perturbations mild.
+ #    - Good first choice when the disease signal may depend on subtle color,
+ #      contrast, texture, or anatomical context.
+ #
+ #    brightness_limit = 0.15
+ #    contrast_limit = 0.15
+ #    gamma_limit = (85, 115)   # approximately gamma 0.85–1.15
+ #    hue_shift_limit = 3       # intentionally small for fundus color realism
+ #    sat_shift_limit = 10
+ #    val_shift_limit = 10
+ #    rotate_limit = 7
+ #    shift_limit = 0.02
+ #    scale_limit = 0.05
+ #    blur_limit = (3, 5)
+ #    downscale_range = (0.80, 0.95)
+ #    jpeg_quality = (75, 100)
+ #
+ #
+ # 2) MORE AGGRESSIVE / DOMAIN-ROBUSTNESS
+ #    Use this when robustness across different CFP cameras, sites, image
+ #    qualities, or acquisition pipelines is more important, and confirm using
+ #    external or camera/site-held-out validation.
+ #
+ #    Rationale:
+ #    - Simulates broader variation across CFP devices and acquisition conditions.
+ #    - May improve domain robustness.
+ #    - Higher risk of altering disease-relevant appearance, so it should be
+ #      validated carefully for the target task.
+ #
+ #    brightness_limit = 0.25
+ #    contrast_limit = 0.25
+ #    gamma_limit = (75, 130)   # approximately gamma 0.75–1.30
+ #    hue_shift_limit = 5       # still limited for fundus color realism
+ #    sat_shift_limit = 18
+ #    val_shift_limit = 18
+ #    rotate_limit = 12
+ #    shift_limit = 0.04
+ #    scale_limit = 0.10
+ #    blur_limit = (3, 7)
+ #    downscale_range = (0.65, 0.95)
+ #    jpeg_quality = (55, 100)
+ # -------------------------------------------------------------------------
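
As a minimal usage sketch of the transforms above (assuming albumentations with ToTensorV2, as imported in this file), passing both image= and mask= keeps the geometric augmentations synchronized between the two:

import numpy as np
from augmentations import get_train_transforms

transforms = get_train_transforms(image_size=512)

# Dummy inputs; real inputs are RGB uint8 images and {0, 1} masks.
image = np.random.randint(0, 256, (1024, 1024, 3), dtype=np.uint8)
mask = np.random.randint(0, 2, (1024, 1024), dtype=np.uint8)

out = transforms(image=image, mask=mask)
print(out["image"].shape)  # torch.Size([3, 512, 512]) after ToTensorV2
print(out["mask"].shape)   # torch.Size([512, 512]); the datasets unsqueeze to [1, H, W]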
checkpoints/fives_resunet/best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f89b779afb9de2859fa57a0282dd5e3e252fab39ab8fdcfa1cc0ce794108bbd
+ size 97523253
datasets/DRIVE.py ADDED
@@ -0,0 +1,223 @@
+ from pathlib import Path
+
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from PIL import Image
+ import torchvision.transforms.functional as TF
+
+
+ class DRIVEDataset(Dataset):
+     """
+     PyTorch Dataset for the DRIVE retinal vessel segmentation dataset.
+
+     Expected structure:
+         DRIVE/
+         ├── training/
+         │   ├── images/
+         │   ├── 1st_manual/
+         │   └── mask/
+         └── test/
+             ├── images/
+             └── mask/
+
+     For training split:
+         image:       21_training.tif
+         vessel mask: 21_manual1.gif
+         FOV mask:    21_training_mask.gif
+
+     For test split:
+         image:       01_test.tif
+         FOV mask:    01_test_mask.gif
+         no vessel mask is included in the provided tree
+     """
+
+     def __init__(
+         self,
+         root,
+         split="training",
+         image_size=None,
+         return_fov=True,
+         transform=None,
+     ):
+         self.root = Path(root)
+         self.split = split
+         self.image_size = image_size
+         self.return_fov = return_fov
+         self.transform = transform
+
+         if split not in ["training", "test"]:
+             raise ValueError("split must be either 'training' or 'test'")
+
+         self.split_dir = self.root / split
+         self.image_dir = self.split_dir / "images"
+         self.fov_dir = self.split_dir / "mask"
+
+         if not self.image_dir.exists():
+             raise FileNotFoundError(f"Image directory not found: {self.image_dir}")
+
+         self.image_paths = sorted(self.image_dir.glob("*.tif"))
+
+         if len(self.image_paths) == 0:
+             raise RuntimeError(f"No .tif images found in {self.image_dir}")
+
+         if split == "training":
+             self.label_dir = self.split_dir / "1st_manual"
+             if not self.label_dir.exists():
+                 raise FileNotFoundError(f"Label directory not found: {self.label_dir}")
+         else:
+             self.label_dir = None
+
+     def __len__(self):
+         return len(self.image_paths)
+
+     def _get_case_id(self, image_path):
+         """
+         Examples:
+             21_training.tif -> 21
+             01_test.tif     -> 01
+         """
+         return image_path.stem.split("_")[0]
+
+     def _load_image(self, path):
+         image = Image.open(path).convert("RGB")
+         return image
+
+     def _load_mask(self, path):
+         mask = Image.open(path).convert("L")
+         return mask
+
+     def _resize_if_needed(self, image, label=None, fov=None):
+         if self.image_size is None:
+             return image, label, fov
+
+         size = self.image_size
+         if isinstance(size, int):
+             size = (size, size)
+
+         image = TF.resize(image, size, interpolation=TF.InterpolationMode.BILINEAR)
+
+         if label is not None:
+             label = TF.resize(label, size, interpolation=TF.InterpolationMode.NEAREST)
+
+         if fov is not None:
+             fov = TF.resize(fov, size, interpolation=TF.InterpolationMode.NEAREST)
+
+         return image, label, fov
+
+     def __getitem__(self, idx):
+         image_path = self.image_paths[idx]
+         case_id = self._get_case_id(image_path)
+
+         image = self._load_image(image_path)
+
+         if self.split == "training":
+             label_path = self.label_dir / f"{case_id}_manual1.gif"
+             label = self._load_mask(label_path)
+         else:
+             label = None
+
+         fov_path = self.fov_dir / f"{case_id}_{self.split}_mask.gif"
+         fov = self._load_mask(fov_path)
+
+         image, label, fov = self._resize_if_needed(image, label, fov)
+
+         if self.transform is not None:
+             image, label, fov = self.transform(image, label, fov)
+
+         image = TF.to_tensor(image)
+
+         sample = {
+             "image": image,
+             "case_id": case_id,
+         }
+
+         if label is not None:
+             label = TF.to_tensor(label)
+             label = (label > 0.5).float()
+             sample["label"] = label
+
+         if self.return_fov:
+             fov = TF.to_tensor(fov)
+             fov = (fov > 0.5).float()
+             sample["fov"] = fov
+
+         return sample
+
+
+ if __name__ == "__main__":
+     import matplotlib.pyplot as plt
+
+     root = "/data/MIDS/datasets/retina/DRIVE"
+
+     dataset = DRIVEDataset(
+         root=root,
+         split="training",
+         image_size=512,
+         return_fov=True,
+     )
+
+     loader = DataLoader(
+         dataset,
+         batch_size=4,
+         shuffle=True,
+         num_workers=0,
+     )
+
+     batch = next(iter(loader))
+
+     print("Number of samples:", len(dataset))
+     print("Batch keys:", batch.keys())
+     print("Image shape:", batch["image"].shape)
+
+     if "label" in batch:
+         print("Label shape:", batch["label"].shape)
+         print("Label min/max:", batch["label"].min().item(), batch["label"].max().item())
+
+     if "fov" in batch:
+         print("FOV shape:", batch["fov"].shape)
+         print("FOV min/max:", batch["fov"].min().item(), batch["fov"].max().item())
+
+     print("Case IDs:", batch["case_id"])
+
+     # -------------------------
+     # Matplotlib visualization
+     # -------------------------
+     image = batch["image"][0]  # [3, H, W]
+     label = batch.get("label", None)
+     fov = batch.get("fov", None)
+
+     image_np = image.permute(1, 2, 0).cpu().numpy()
+
+     fig, axes = plt.subplots(1, 4, figsize=(16, 4))
+
+     axes[0].imshow(image_np)
+     axes[0].set_title("Image")
+     axes[0].axis("off")
+
+     if label is not None:
+         label_np = label[0, 0].cpu().numpy()
+
+         axes[1].imshow(label_np, cmap="gray")
+         axes[1].set_title("Vessel Label")
+         axes[1].axis("off")
+
+         axes[2].imshow(image_np)
+         axes[2].imshow(label_np, cmap="Reds", alpha=0.45)
+         axes[2].set_title("Image + Vessel Overlay")
+         axes[2].axis("off")
+     else:
+         axes[1].axis("off")
+         axes[2].axis("off")
+
+     if fov is not None:
+         fov_np = fov[0, 0].cpu().numpy()
+
+         axes[3].imshow(image_np)
+         axes[3].imshow(fov_np, cmap="gray", alpha=0.25)
+         axes[3].set_title("Image + FOV Overlay")
+         axes[3].axis("off")
+     else:
+         axes[3].axis("off")
+
+     plt.tight_layout()
+     plt.show()
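
Note that DRIVEDataset's transform hook differs from the albumentations style used by the other datasets in this commit: it is called with, and must return, the (image, label, fov) triple as PIL images, after resizing and before tensor conversion. A minimal sketch of a compatible joint transform (joint_hflip is a hypothetical helper, not part of the commit):

import random
import torchvision.transforms.functional as TF
from datasets.DRIVE import DRIVEDataset

def joint_hflip(image, label, fov, p=0.5):
    # Flip image, vessel label, and FOV mask together so they stay aligned.
    # label is None for the "test" split and must be passed through as-is.
    if random.random() < p:
        image = TF.hflip(image)
        label = TF.hflip(label) if label is not None else None
        fov = TF.hflip(fov) if fov is not None else None
    return image, label, fov

dataset = DRIVEDataset(root="/path/to/DRIVE", split="training", transform=joint_hflip)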
datasets/FGADR.py ADDED
@@ -0,0 +1,342 @@
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from PIL import Image
+ from sklearn.model_selection import KFold
+
+
+ class FGADRDataset(Dataset):
+     """
+     FGADR Seg-set dataset for diabetic retinopathy lesion segmentation.
+
+     Expected structure:
+         Seg-set/
+         ├── DR_Seg_Grading_Label.csv
+         ├── Original_Images/
+         ├── Microaneurysms_Masks/
+         ├── Hemohedge_Masks/
+         ├── HardExudate_Masks/
+         ├── SoftExudate_Masks/
+         ├── IRMA_Masks/
+         └── Neovascularization_Masks/
+
+     CSV format, no header:
+         filename,dr_grade
+
+     Output:
+         image:   [3, H, W]
+         label:   [6, H, W]
+         grade:   scalar long tensor
+         case_id: filename stem
+
+     split:
+         "train" = all folds except selected fold
+         "val"   = selected fold
+         "all"   = full dataset
+
+     Notes:
+         If a lesion-specific mask file is absent, it is treated as an empty
+         all-zero mask, meaning no incidence of that lesion class.
+     """
+
+     lesion_dirs = {
+         "microaneurysm": "Microaneurysms_Masks",
+         "hemorrhage": "Hemohedge_Masks",
+         "hard_exudate": "HardExudate_Masks",
+         "soft_exudate": "SoftExudate_Masks",
+         "irma": "IRMA_Masks",
+         "neovascularization": "Neovascularization_Masks",
+     }
+
+     def __init__(
+         self,
+         root,
+         split="train",
+         fold=0,
+         n_folds=5,
+         seed=42,
+         transform=None,
+         csv_name="DR_Seg_Grading_Label.csv",
+         image_dir_name="Original_Images",
+         mask_suffix="",
+     ):
+         self.root = Path(root)
+         self.split = split
+         self.fold = fold
+         self.n_folds = n_folds
+         self.seed = seed
+         self.transform = transform
+         self.csv_path = self.root / csv_name
+         self.image_dir = self.root / image_dir_name
+         self.mask_suffix = mask_suffix
+
+         if split not in ["train", "val", "all"]:
+             raise ValueError("split must be one of: 'train', 'val', 'all'")
+
+         if not (0 <= fold < n_folds):
+             raise ValueError(f"fold must be in [0, {n_folds - 1}], got {fold}")
+
+         if not self.image_dir.exists():
+             raise FileNotFoundError(f"Image directory not found: {self.image_dir}")
+
+         if not self.csv_path.exists():
+             raise FileNotFoundError(f"CSV file not found: {self.csv_path}")
+
+         self.class_names = list(self.lesion_dirs.keys())
+
+         for dirname in self.lesion_dirs.values():
+             mask_dir = self.root / dirname
+             if not mask_dir.exists():
+                 raise FileNotFoundError(f"Mask directory not found: {mask_dir}")
+
+         all_samples = self._read_csv()
+
+         if len(all_samples) == 0:
+             raise RuntimeError(f"No samples found in {self.csv_path}")
+
+         if split == "all":
+             self.samples = all_samples
+         else:
+             kfold = KFold(
+                 n_splits=n_folds,
+                 shuffle=True,
+                 random_state=seed,
+             )
+
+             splits = list(kfold.split(all_samples))
+             train_indices, val_indices = splits[fold]
+
+             if split == "train":
+                 self.samples = [all_samples[i] for i in train_indices]
+             else:
+                 self.samples = [all_samples[i] for i in val_indices]
+
+     def _read_csv(self):
+         samples = []
+
+         with open(self.csv_path, "r") as f:
+             for line in f:
+                 line = line.strip()
+
+                 if not line:
+                     continue
+
+                 parts = line.split(",")
+
+                 if len(parts) < 2:
+                     continue
+
+                 filename = parts[0].strip()
+                 grade = int(parts[1].strip())
+
+                 image_path = self.image_dir / filename
+
+                 if not image_path.exists():
+                     raise FileNotFoundError(f"Image not found: {image_path}")
+
+                 samples.append(
+                     {
+                         "filename": filename,
+                         "case_id": Path(filename).stem,
+                         "image_path": image_path,
+                         "grade": grade,
+                     }
+                 )
+
+         return samples
+
+     def __len__(self):
+         return len(self.samples)
+
+     def _load_image(self, path):
+         image = Image.open(path).convert("RGB")
+         return np.array(image)
+
+     def _load_mask(self, path, shape):
+         if path.exists():
+             mask = Image.open(path).convert("L")
+             mask = np.array(mask)
+         else:
+             mask = np.zeros(shape, dtype=np.uint8)
+
+         return mask
+
+     def _get_mask_path(self, lesion_name, filename):
+         mask_dir = self.root / self.lesion_dirs[lesion_name]
+
+         if self.mask_suffix:
+             stem = Path(filename).stem
+             suffix = Path(filename).suffix
+             filename = f"{stem}{self.mask_suffix}{suffix}"
+
+         return mask_dir / filename
+
+     def __getitem__(self, idx):
+         sample_info = self.samples[idx]
+
+         filename = sample_info["filename"]
+         image_path = sample_info["image_path"]
+         case_id = sample_info["case_id"]
+         grade = sample_info["grade"]
+
+         image = self._load_image(image_path)
+         h, w = image.shape[:2]
+
+         masks = []
+         mask_paths = {}
+
+         for lesion_name in self.class_names:
+             mask_path = self._get_mask_path(lesion_name, filename)
+             mask = self._load_mask(mask_path, shape=(h, w))
+
+             masks.append(mask)
+             mask_paths[lesion_name] = str(mask_path)
+
+         if self.transform is not None:
+             transformed = self.transform(
+                 image=image,
+                 masks=masks,
+             )
+
+             image = transformed["image"]
+             masks = transformed["masks"]
+
+             masks = [
+                 m.float() if isinstance(m, torch.Tensor) else torch.from_numpy(m).float()
+                 for m in masks
+             ]
+
+             label = torch.stack(masks, dim=0)
+
+         else:
+             image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
+             label = torch.stack(
+                 [torch.from_numpy(m).float() for m in masks],
+                 dim=0,
+             )
+
+         label = (label > 0).float()
+
+         return {
+             "image": image,
+             "label": label,
+             "grade": torch.tensor(grade, dtype=torch.long),
+             "case_id": case_id,
+             "filename": filename,
+             "image_path": str(image_path),
+             "mask_paths": mask_paths,
+         }
+
+
+ if __name__ == "__main__":
+     import matplotlib.pyplot as plt
+     from tqdm import tqdm
+
+     try:
+         from augmentations import get_train_transforms, IMAGENET_MEAN, IMAGENET_STD
+     except ImportError:
+         import sys
+
+         project_root = Path(__file__).resolve().parents[1]
+         sys.path.append(str(project_root))
+
+         from augmentations import get_train_transforms, IMAGENET_MEAN, IMAGENET_STD
+
+     root = "/data/MIDS/datasets/retina/FGADR/Seg-set"
+     image_size = 512
+
+     dataset = FGADRDataset(
+         root=root,
+         split="train",
+         fold=0,
+         n_folds=5,
+         seed=42,
+         transform=get_train_transforms(image_size=image_size),
+     )
+
+     print("\nChecking all FGADR files...")
+
+     missing_images = 0
+     absent_masks = 0
+
+     for sample in tqdm(dataset.samples, desc="Checking files"):
+         filename = sample["filename"]
+
+         if not sample["image_path"].exists():
+             print(f"Missing image: {sample['image_path']}")
+             missing_images += 1
+
+         for lesion_name in dataset.class_names:
+             mask_path = dataset._get_mask_path(lesion_name, filename)
+
+             if not mask_path.exists():
+                 absent_masks += 1
+
+     print("File check complete.")
+     print(f"Missing images: {missing_images}")
+     print(f"Absent lesion masks treated as empty: {absent_masks}")
+
+     loader = DataLoader(
+         dataset,
+         batch_size=4,
+         shuffle=True,
+         num_workers=0,
+     )
+
+     batch = next(iter(loader))
+
+     print("\nSmoke test batch:")
+     print("Number of samples:", len(dataset))
+     print("Split:", dataset.split)
+     print("Fold:", dataset.fold)
+     print("Number of folds:", dataset.n_folds)
+     print("Class names:", dataset.class_names)
+     print("Batch keys:", batch.keys())
+     print("Image shape:", batch["image"].shape)
+     print("Label shape:", batch["label"].shape)
+     print("Grade shape:", batch["grade"].shape)
+     print("Label min/max:", batch["label"].min().item(), batch["label"].max().item())
+     print("Case IDs:", batch["case_id"])
+
+     image = batch["image"][0].cpu()
+     label = batch["label"][0].cpu()
+     grade = batch["grade"][0].item()
+
+     mean = torch.tensor(IMAGENET_MEAN).view(3, 1, 1)
+     std = torch.tensor(IMAGENET_STD).view(3, 1, 1)
+
+     image_vis = image * std + mean
+     image_vis = image_vis.clamp(0, 1)
+     image_vis = image_vis.permute(1, 2, 0).numpy()
+
+     combined_mask = (label.sum(dim=0) > 0).float().numpy()
+
+     fig, axes = plt.subplots(2, 5, figsize=(20, 8))
+     axes = axes.flatten()
+
+     axes[0].imshow(image_vis)
+     axes[0].set_title(f"Image | Grade {grade}")
+     axes[0].axis("off")
+
+     axes[1].imshow(combined_mask, cmap="gray")
+     axes[1].set_title("Any Lesion")
+     axes[1].axis("off")
+
+     axes[2].imshow(image_vis)
+     axes[2].imshow(combined_mask, cmap="Reds", alpha=0.45)
+     axes[2].set_title("Overlay")
+     axes[2].axis("off")
+
+     for ax in axes[3:]:
+         ax.axis("off")
+
+     for i, class_name in enumerate(dataset.class_names):
+         ax = axes[i + 3]
+         ax.imshow(label[i].numpy(), cmap="gray")
+         ax.set_title(class_name)
+         ax.axis("off")
+
+     plt.tight_layout()
+     plt.show()
datasets/FIVES.py ADDED
@@ -0,0 +1,215 @@
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from PIL import Image
+
+
+ class FIVESDataset(Dataset):
+     """
+     PyTorch Dataset for FIVES retinal vessel segmentation.
+
+     Expected structure:
+         FIVES_dataset/
+         ├── train/
+         │   ├── Original/
+         │   └── Ground truth/
+         └── test/
+             ├── Original/
+             └── Ground truth/
+
+     Each image in Original/ should have a matching vessel mask
+     with the same filename in Ground truth/.
+
+     Output sample:
+         {
+             "image": Tensor [3, H, W],
+             "label": Tensor [1, H, W],
+             "case_id": str,
+             "image_path": str,
+             "label_path": str,
+         }
+
+     If transform is provided, it should be an Albumentations transform.
+     """
+
+     def __init__(
+         self,
+         root,
+         split="train",
+         transform=None,
+         image_dir_name="Original",
+         label_dir_name="Ground truth",
+     ):
+         self.root = Path(root)
+         self.split = split
+         self.transform = transform
+
+         if split not in ["train", "test"]:
+             raise ValueError("split must be either 'train' or 'test'")
+
+         self.split_dir = self.root / split
+         self.image_dir = self.split_dir / image_dir_name
+         self.label_dir = self.split_dir / label_dir_name
+
+         if not self.image_dir.exists():
+             raise FileNotFoundError(f"Image directory not found: {self.image_dir}")
+
+         if not self.label_dir.exists():
+             raise FileNotFoundError(f"Label directory not found: {self.label_dir}")
+
+         self.image_paths = sorted(
+             [
+                 p for p in self.image_dir.glob("*.png")
+                 if not p.name.startswith(".") and p.name.lower() != "thumbs.db"
+             ]
+         )
+
+         if len(self.image_paths) == 0:
+             raise RuntimeError(f"No PNG images found in {self.image_dir}")
+
+         self.samples = []
+
+         for image_path in self.image_paths:
+             label_path = self.label_dir / image_path.name
+
+             if not label_path.exists():
+                 raise FileNotFoundError(
+                     f"Missing label for image:\n"
+                     f"image: {image_path}\n"
+                     f"label: {label_path}"
+                 )
+
+             self.samples.append(
+                 {
+                     "image_path": image_path,
+                     "label_path": label_path,
+                     "case_id": image_path.stem,
+                 }
+             )
+
+     def __len__(self):
+         return len(self.samples)
+
+     def _load_image(self, path):
+         image = Image.open(path).convert("RGB")
+         return np.array(image)
+
+     def _load_mask(self, path):
+         mask = Image.open(path).convert("L")
+         return np.array(mask)
+
+     def __getitem__(self, idx):
+         sample_info = self.samples[idx]
+
+         image_path = sample_info["image_path"]
+         label_path = sample_info["label_path"]
+         case_id = sample_info["case_id"]
+
+         image = self._load_image(image_path)
+         label = self._load_mask(label_path)
+
+         if self.transform is not None:
+             transformed = self.transform(
+                 image=image,
+                 mask=label,
+             )
+
+             image = transformed["image"]
+             label = transformed["mask"]
+
+             # Albumentations ToTensorV2 converts image to [3, H, W],
+             # but mask remains [H, W], so add channel dimension.
+             if isinstance(label, torch.Tensor):
+                 label = label.float().unsqueeze(0)
+             else:
+                 label = torch.from_numpy(label).float().unsqueeze(0)
+
+         else:
+             image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
+             label = torch.from_numpy(label).float().unsqueeze(0)
+
+         # Convert vessel mask to binary {0, 1}
+         label = (label > 0).float()
+
+         return {
+             "image": image,
+             "label": label,
+             "case_id": case_id,
+             "image_path": str(image_path),
+             "label_path": str(label_path),
+         }
+
+
+ if __name__ == "__main__":
+     import matplotlib.pyplot as plt
+
+     try:
+         from augmentations import get_train_transforms, get_val_transforms
+     except ImportError:
+         import sys
+
+         project_root = Path(__file__).resolve().parents[1]
+         sys.path.append(str(project_root))
+
+         from augmentations import get_train_transforms, get_val_transforms
+
+     root = "/data/MIDS/datasets/retina/FIVES_dataset"
+     image_size = 512
+
+     dataset = FIVESDataset(
+         root=root,
+         split="train",
+         transform=get_train_transforms(image_size=image_size),
+     )
+
+     loader = DataLoader(
+         dataset,
+         batch_size=4,
+         shuffle=True,
+         num_workers=0,
+     )
+
+     batch = next(iter(loader))
+
+     print("Number of samples:", len(dataset))
+     print("Batch keys:", batch.keys())
+     print("Image shape:", batch["image"].shape)
+     print("Label shape:", batch["label"].shape)
+     print("Label min/max:", batch["label"].min().item(), batch["label"].max().item())
+     print("Case IDs:", batch["case_id"])
+
+     # -------------------------
+     # Matplotlib visualization
+     # -------------------------
+     image = batch["image"][0]
+     label = batch["label"][0, 0]
+
+     # Undo ImageNet normalization for visualization.
+     mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
+     std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
+
+     image_vis = image.cpu() * std + mean
+     image_vis = image_vis.clamp(0, 1)
+     image_vis = image_vis.permute(1, 2, 0).numpy()
+
+     label_vis = label.cpu().numpy()
+
+     fig, axes = plt.subplots(1, 3, figsize=(12, 4))
+
+     axes[0].imshow(image_vis)
+     axes[0].set_title("Image")
+     axes[0].axis("off")
+
+     axes[1].imshow(label_vis, cmap="gray")
+     axes[1].set_title("Vessel Label")
+     axes[1].axis("off")
+
+     axes[2].imshow(image_vis)
+     axes[2].imshow(label_vis, cmap="Reds", alpha=0.45)
+     axes[2].set_title("Overlay")
+     axes[2].axis("off")
+
+     plt.tight_layout()
+     plt.show()
datasets/__init__.py ADDED
File without changes
losses.py ADDED
@@ -0,0 +1,135 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class DiceLoss(nn.Module):
+     """
+     Soft Dice loss for binary segmentation.
+
+     Expected shapes:
+         logits:  [B, 1, H, W]
+         targets: [B, 1, H, W]
+         mask:    [B, 1, H, W], optional FOV mask
+
+     The model should output raw logits, not sigmoid probabilities.
+     """
+
+     def __init__(self, smooth=1.0):
+         super().__init__()
+         self.smooth = smooth
+
+     def forward(self, logits, targets, mask=None):
+         probs = torch.sigmoid(logits)
+
+         if mask is not None:
+             probs = probs * mask
+             targets = targets * mask
+
+         probs = probs.flatten(1)
+         targets = targets.flatten(1)
+
+         intersection = (probs * targets).sum(dim=1)
+         denominator = probs.sum(dim=1) + targets.sum(dim=1)
+
+         dice = (2.0 * intersection + self.smooth) / (
+             denominator + self.smooth
+         )
+
+         return 1.0 - dice.mean()
+
+
+ class BCEDiceLoss(nn.Module):
+     """
+     BCEWithLogits + Dice loss for binary vessel segmentation.
+
+     The optional mask argument is intended for the DRIVE FOV mask, so that
+     background outside the retinal field of view does not dominate training.
+     """
+
+     def __init__(
+         self,
+         bce_weight=1.0,
+         dice_weight=1.0,
+         smooth=1.0,
+     ):
+         super().__init__()
+
+         self.bce_weight = bce_weight
+         self.dice_weight = dice_weight
+         self.dice = DiceLoss(smooth=smooth)
+
+     def forward(self, logits, targets, mask=None):
+         bce = F.binary_cross_entropy_with_logits(
+             logits,
+             targets,
+             reduction="none",
+         )
+
+         if mask is not None:
+             bce = bce * mask
+             bce = bce.sum() / mask.sum().clamp_min(1.0)
+         else:
+             bce = bce.mean()
+
+         dice = self.dice(logits, targets, mask)
+
+         loss = self.bce_weight * bce + self.dice_weight * dice
+
+         return loss
+
+
+ @torch.no_grad()
+ def compute_dice_score(
+     logits,
+     targets,
+     mask=None,
+     threshold=0.5,
+     eps=1e-7,
+ ):
+     """
+     Hard Dice score for monitoring.
+
+     Expected shapes:
+         logits:  [B, 1, H, W]
+         targets: [B, 1, H, W]
+         mask:    [B, 1, H, W], optional
+     """
+
+     probs = torch.sigmoid(logits)
+     preds = (probs > threshold).float()
+
+     if mask is not None:
+         preds = preds * mask
+         targets = targets * mask
+
+     preds = preds.flatten(1)
+     targets = targets.flatten(1)
+
+     intersection = (preds * targets).sum(dim=1)
+     denominator = preds.sum(dim=1) + targets.sum(dim=1)
+
+     dice = (2.0 * intersection + eps) / (denominator + eps)
+
+     return dice.mean().item()
+
+
+ if __name__ == "__main__":
+     # Smoke test:
+     #   python losses.py
+
+     logits = torch.randn(2, 1, 512, 512)
+     targets = torch.randint(0, 2, (2, 1, 512, 512)).float()
+     fov = torch.ones(2, 1, 512, 512)
+
+     criterion = BCEDiceLoss(
+         bce_weight=1.0,
+         dice_weight=1.0,
+     )
+
+     loss = criterion(logits, targets, fov)
+     dice = compute_dice_score(logits, targets, fov)
+
+     print("Loss:", loss.item())
+     print("Dice:", dice)
+     print("Smoke test passed.")
models/__init__.py ADDED
@@ -0,0 +1,66 @@
+ from .unet import build_resunet
+ from .deeplabv3 import build_deeplabv3
+ from .vit import build_vit
+
+
+ def build_model(
+     model_name="resunet",
+     num_classes=1,
+     in_channels=3,
+     image_size=512,
+     backbone="resnet50",
+     pretrained=True,
+     base_channels=32,
+     dropout=0.0,
+ ):
+     """
+     Generic model builder.
+
+     model_name options:
+         resunet
+         deeplabv3
+         vit
+
+     backbone:
+         For deeplabv3:
+             resnet50, resnet101
+
+         For vit:
+             tiny, small, base, large
+             or a timm model name
+
+         For resunet:
+             unused
+     """
+
+     model_name = model_name.lower()
+
+     if model_name == "resunet":
+         return build_resunet(
+             in_channels=in_channels,
+             num_classes=num_classes,
+             base_channels=base_channels,
+             dropout=dropout,
+         )
+
+     if model_name == "deeplabv3":
+         return build_deeplabv3(
+             backbone=backbone,
+             num_classes=num_classes,
+             pretrained_backbone=pretrained,
+         )
+
+     if model_name == "vit":
+         return build_vit(
+             variant=backbone,
+             num_classes=num_classes,
+             pretrained=pretrained,
+             in_chans=in_channels,
+             img_size=image_size,
+             dropout=dropout,
+         )
+
+     raise ValueError(
+         f"Unsupported model_name: {model_name}. "
+         "Choose from: resunet, deeplabv3, vit."
+     )
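
A short usage sketch of the builder above (arguments follow the defaults in this file; pretrained=False avoids weight downloads):

from models import build_model

resunet = build_model(model_name="resunet", num_classes=1, base_channels=32)
deeplab = build_model(model_name="deeplabv3", backbone="resnet50", pretrained=False)
vit = build_model(model_name="vit", backbone="base", image_size=512, pretrained=False)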
models/deeplabv3.py ADDED
@@ -0,0 +1,113 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from torchvision.models.segmentation import (
+     deeplabv3_resnet50,
+     deeplabv3_resnet101,
+ )
+ from torchvision.models.segmentation.deeplabv3 import DeepLabHead
+
+
+ class DeepLabV3Wrapper(nn.Module):
+     """
+     DeepLabV3 wrapper for retinal vessel segmentation.
+
+     Output:
+         Raw logits [B, num_classes, H, W]
+
+     For binary vessel segmentation:
+         num_classes = 1
+     """
+
+     def __init__(
+         self,
+         backbone="resnet50",
+         num_classes=1,
+         pretrained_backbone=True,
+         aux_loss=False,
+     ):
+         super().__init__()
+
+         if backbone == "resnet50":
+             model = deeplabv3_resnet50(
+                 weights=None,
+                 weights_backbone="DEFAULT" if pretrained_backbone else None,
+                 aux_loss=aux_loss,
+             )
+             in_channels = 2048
+
+         elif backbone == "resnet101":
+             model = deeplabv3_resnet101(
+                 weights=None,
+                 weights_backbone="DEFAULT" if pretrained_backbone else None,
+                 aux_loss=aux_loss,
+             )
+             in_channels = 2048
+
+         else:
+             raise ValueError(
+                 f"Unsupported backbone: {backbone}. "
+                 "Choose from: 'resnet50', 'resnet101'."
+             )
+
+         model.classifier = DeepLabHead(
+             in_channels=in_channels,
+             num_classes=num_classes,
+         )
+
+         if aux_loss and model.aux_classifier is not None:
+             model.aux_classifier[-1] = nn.Conv2d(
+                 model.aux_classifier[-1].in_channels,
+                 num_classes,
+                 kernel_size=1,
+             )
+
+         self.model = model
+
+     def forward(self, x):
+         output = self.model(x)
+
+         # torchvision segmentation models return dict:
+         #   {"out": logits, "aux": optional aux logits}
+         return output["out"]
+
+
+ def build_deeplabv3(
+     backbone="resnet50",
+     num_classes=1,
+     pretrained_backbone=True,
+     aux_loss=False,
+ ):
+     return DeepLabV3Wrapper(
+         backbone=backbone,
+         num_classes=num_classes,
+         pretrained_backbone=pretrained_backbone,
+         aux_loss=aux_loss,
+     )
+
+
+ if __name__ == "__main__":
+     # Smoke test:
+     #   python models/deeplabv3.py
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     model = build_deeplabv3(
+         backbone="resnet50",
+         num_classes=1,
+         pretrained_backbone=False,
+     ).to(device)
+
+     x = torch.randn(2, 3, 512, 512).to(device)
+
+     with torch.no_grad():
+         y = model(x)
+
+     print("Input shape:", x.shape)
+     print("Output shape:", y.shape)
+     print("Output min/max:", y.min().item(), y.max().item())
+
+     assert y.shape == (2, 1, 512, 512)
+
+     print("Smoke test passed.")
models/unet.py ADDED
@@ -0,0 +1,205 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class ConvBNReLU(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size=3, padding=1):
+         super().__init__()
+
+         self.block = nn.Sequential(
+             nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=False),
+             nn.BatchNorm2d(out_channels),
+             nn.ReLU(inplace=True),
+         )
+
+     def forward(self, x):
+         return self.block(x)
+
+
+ class ResidualBlock(nn.Module):
+     """
+     Basic residual block for ResUNet.
+
+     If in_channels != out_channels, the shortcut uses a 1x1 conv.
+     """
+
+     def __init__(self, in_channels, out_channels):
+         super().__init__()
+
+         self.conv1 = ConvBNReLU(in_channels, out_channels)
+         self.conv2 = nn.Sequential(
+             nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
+             nn.BatchNorm2d(out_channels),
+         )
+
+         if in_channels != out_channels:
+             self.shortcut = nn.Sequential(
+                 nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
+                 nn.BatchNorm2d(out_channels),
+             )
+         else:
+             self.shortcut = nn.Identity()
+
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         residual = self.shortcut(x)
+
+         x = self.conv1(x)
+         x = self.conv2(x)
+
+         x = x + residual
+         x = self.relu(x)
+
+         return x
+
+
+ class EncoderBlock(nn.Module):
+     def __init__(self, in_channels, out_channels):
+         super().__init__()
+
+         self.res_block = ResidualBlock(in_channels, out_channels)
+         self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+
+     def forward(self, x):
+         skip = self.res_block(x)
+         pooled = self.pool(skip)
+         return skip, pooled
+
+
+ class DecoderBlock(nn.Module):
+     def __init__(self, in_channels, skip_channels, out_channels):
+         super().__init__()
+
+         self.up = nn.ConvTranspose2d(
+             in_channels,
+             out_channels,
+             kernel_size=2,
+             stride=2,
+         )
+
+         self.res_block = ResidualBlock(
+             out_channels + skip_channels,
+             out_channels,
+         )
+
+     def forward(self, x, skip):
+         x = self.up(x)
+
+         # Handles odd image sizes, though 512/1024 should already match.
+         if x.shape[-2:] != skip.shape[-2:]:
+             x = F.interpolate(
+                 x,
+                 size=skip.shape[-2:],
+                 mode="bilinear",
+                 align_corners=False,
+             )
+
+         x = torch.cat([x, skip], dim=1)
+         x = self.res_block(x)
+
+         return x
+
+
+ class ResUNet(nn.Module):
+     """
+     ResUNet for binary or multi-class retinal segmentation.
+
+     Output:
+         Raw logits of shape [B, num_classes, H, W]
+
+     For vessel segmentation:
+         num_classes=1
+         loss=BCEWithLogits/Dice/Tversky/etc.
+     """
+
+     def __init__(
+         self,
+         in_channels=3,
+         num_classes=1,
+         base_channels=32,
+         dropout=0.0,
+     ):
+         super().__init__()
+
+         c1 = base_channels
+         c2 = base_channels * 2
+         c3 = base_channels * 4
+         c4 = base_channels * 8
+         c5 = base_channels * 16
+
+         self.enc1 = EncoderBlock(in_channels, c1)
+         self.enc2 = EncoderBlock(c1, c2)
+         self.enc3 = EncoderBlock(c2, c3)
+         self.enc4 = EncoderBlock(c3, c4)
+
+         self.bottleneck = nn.Sequential(
+             ResidualBlock(c4, c5),
+             nn.Dropout2d(dropout),
+         )
+
+         self.dec4 = DecoderBlock(c5, c4, c4)
+         self.dec3 = DecoderBlock(c4, c3, c3)
+         self.dec2 = DecoderBlock(c3, c2, c2)
+         self.dec1 = DecoderBlock(c2, c1, c1)
+
+         self.out_conv = nn.Conv2d(c1, num_classes, kernel_size=1)
+
+     def forward(self, x):
+         s1, x = self.enc1(x)
+         s2, x = self.enc2(x)
+         s3, x = self.enc3(x)
+         s4, x = self.enc4(x)
+
+         x = self.bottleneck(x)
+
+         x = self.dec4(x, s4)
+         x = self.dec3(x, s3)
+         x = self.dec2(x, s2)
+         x = self.dec1(x, s1)
+
+         logits = self.out_conv(x)
+
+         return logits
+
+
+ def build_resunet(
+     in_channels=3,
+     num_classes=1,
+     base_channels=32,
+     dropout=0.0,
+ ):
+     return ResUNet(
+         in_channels=in_channels,
+         num_classes=num_classes,
+         base_channels=base_channels,
+         dropout=dropout,
+     )
+
+
+ if __name__ == "__main__":
+     # Smoke test:
+     #   python models/unet.py
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     model = build_resunet(
+         in_channels=3,
+         num_classes=1,
+         base_channels=32,
+         dropout=0.0,
+     ).to(device)
+
+     x = torch.randn(2, 3, 512, 512).to(device)
+
+     with torch.no_grad():
+         y = model(x)
+
+     print("Input shape:", x.shape)
+     print("Output shape:", y.shape)
+     print("Output min/max:", y.min().item(), y.max().item())
+
+     assert y.shape == (2, 1, 512, 512)
+
+     print("Smoke test passed.")
models/vit.py ADDED
@@ -0,0 +1,223 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ try:
+     import timm
+ except ImportError as e:
+     raise ImportError(
+         "timm is required for models/vit.py. Install with: pip install timm"
+     ) from e
+
+
+ class ViTSegmentationModel(nn.Module):
+     """
+     Simple ViT segmentation model using a timm Vision Transformer backbone.
+
+     The model:
+         image -> ViT patch tokens -> reshape to feature map -> conv head -> upsample
+
+     Output:
+         logits of shape [B, num_classes, H, W]
+
+     For binary vessel segmentation:
+         num_classes = 1
+
+     For multi-class lesion segmentation:
+         num_classes = number of lesion/background classes
+     """
+
+     def __init__(
+         self,
+         model_name="vit_base_patch16_224",
+         num_classes=1,
+         pretrained=True,
+         in_chans=3,
+         img_size=512,
+         decoder_dim=256,
+         dropout=0.0,
+     ):
+         super().__init__()
+
+         self.model_name = model_name
+         self.num_classes = num_classes
+         self.img_size = img_size
+
+         self.backbone = timm.create_model(
+             model_name,
+             pretrained=pretrained,
+             num_classes=0,
+             global_pool="",
+             in_chans=in_chans,
+             img_size=img_size,
+         )
+
+         self.embed_dim = self.backbone.num_features
+         self.patch_size = self.backbone.patch_embed.patch_size
+
+         if isinstance(self.patch_size, tuple):
+             self.patch_size = self.patch_size[0]
+
+         self.decoder = nn.Sequential(
+             nn.Conv2d(self.embed_dim, decoder_dim, kernel_size=1),
+             nn.BatchNorm2d(decoder_dim),
+             nn.ReLU(inplace=True),
+             nn.Dropout2d(dropout),
+             nn.Conv2d(decoder_dim, decoder_dim, kernel_size=3, padding=1),
+             nn.BatchNorm2d(decoder_dim),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(decoder_dim, num_classes, kernel_size=1),
+         )
+
+     def forward_features_as_map(self, x):
+         """
+         Convert ViT patch tokens into a spatial feature map.
+
+         Input:
+             x: [B, C, H, W]
+
+         Output:
+             feature_map: [B, embed_dim, H // patch_size, W // patch_size]
+         """
+         b, _, h, w = x.shape
+
+         tokens = self.backbone.forward_features(x)
+
+         # Some timm models return a tuple/list. Usually the first item is token features.
+         if isinstance(tokens, (tuple, list)):
+             tokens = tokens[0]
+
+         # For standard ViT:
+         #   tokens: [B, 1 + num_patches, C], where the first token is CLS.
+         if tokens.ndim == 3:
+             expected_num_patches = (h // self.patch_size) * (w // self.patch_size)
+
+             if tokens.shape[1] == expected_num_patches + 1:
+                 tokens = tokens[:, 1:, :]  # remove CLS token
+
+             feature_h = h // self.patch_size
+             feature_w = w // self.patch_size
+
+             tokens = tokens.transpose(1, 2)
+             feature_map = tokens.reshape(b, self.embed_dim, feature_h, feature_w)
+
+         # Some backbones may already return [B, C, H, W].
+         elif tokens.ndim == 4:
+             feature_map = tokens
+
+         else:
+             raise RuntimeError(f"Unexpected ViT feature shape: {tokens.shape}")
+
+         return feature_map
+
+     def forward(self, x):
+         input_size = x.shape[-2:]
+
+         feature_map = self.forward_features_as_map(x)
+         logits = self.decoder(feature_map)
+
+         logits = F.interpolate(
+             logits,
+             size=input_size,
+             mode="bilinear",
+             align_corners=False,
+         )
+
+         return logits
+
+
+ def build_vit(
+     variant="base",
+     num_classes=1,
+     pretrained=True,
+     in_chans=3,
+     img_size=512,
+     decoder_dim=256,
+     dropout=0.0,
+ ):
+     """
+     Build a timm ViT segmentation model.
+
+     Parameters
+     ----------
+     variant:
+         One of:
+             "tiny"
+             "small"
+             "base"
+             "large"
+
+         Or directly pass a timm model name, e.g.:
+             "vit_base_patch16_224"
+             "vit_small_patch16_224"
+             "vit_large_patch16_224"
+
+     num_classes:
+         Number of output channels.
+
+         Binary segmentation:
+             num_classes=1
+
+         Multi-class segmentation:
+             num_classes=N
+
+     pretrained:
+         Whether to load ImageNet-pretrained timm weights.
+
+     img_size:
+         Input image size. For DRIVE, 512 is a reasonable default.
+
+     Returns
+     -------
+     model:
+         ViTSegmentationModel
+     """
+
+     variants = {
+         "tiny": "vit_tiny_patch16_224",
+         "small": "vit_small_patch16_224",
+         "base": "vit_base_patch16_224",
+         "large": "vit_large_patch16_224",
+     }
+
+     model_name = variants.get(variant, variant)
+
+     model = ViTSegmentationModel(
+         model_name=model_name,
+         num_classes=num_classes,
+         pretrained=pretrained,
+         in_chans=in_chans,
+         img_size=img_size,
+         decoder_dim=decoder_dim,
+         dropout=dropout,
+     )
+
+     return model
+
+
+ if __name__ == "__main__":
+     # Smoke test:
+     #   python models/vit.py
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     model = build_vit(
+         variant="base",
+         num_classes=1,
+         pretrained=False,
+         img_size=512,
+     ).to(device)
+
+     x = torch.randn(2, 3, 512, 512).to(device)
+
+     with torch.no_grad():
+         y = model(x)
+
+     print("Model:", model.model_name)
+     print("Input shape:", x.shape)
+     print("Output shape:", y.shape)
+     print("Output min/max:", y.min().item(), y.max().item())
+
+     assert y.shape == (2, 1, 512, 512)
+
+     print("Smoke test passed.")
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ albumentations
+ gradio
+ huggingface_hub
+ numpy
+ opencv-python
+ pandas
+ pillow
+ pydantic
+ timm
+ torch
+ torchvision
+ torchaudio
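
Note that the list above covers the Gradio app itself; the training and dataset demo scripts in this commit additionally import tqdm, scikit-learn, and matplotlib, which would need to be installed separately.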
train.py ADDED
@@ -0,0 +1,256 @@
+ import argparse
+ from pathlib import Path
+ from tqdm import tqdm
+
+ import torch
+ from torch.utils.data import DataLoader
+
+ from augmentations import get_train_transforms, get_val_transforms
+ from datasets.FIVES import FIVESDataset
+ from models import build_model
+ from losses import BCEDiceLoss, compute_dice_score
+
+
+ def train_one_epoch(model, loader, optimizer, scaler, criterion, device, use_amp=True):
+     model.train()
+
+     running_loss = 0.0
+     running_dice = 0.0
+
+     pbar = tqdm(loader, desc="Train", leave=False)
+
+     for batch in pbar:
+         images = batch["image"].to(device)
+         labels = batch["label"].to(device)
+
+         optimizer.zero_grad(set_to_none=True)
+
+         with torch.amp.autocast("cuda", enabled=use_amp and device.type == "cuda"):
+             logits = model(images)
+             loss = criterion(logits, labels)
+
+         scaler.scale(loss).backward()
+         scaler.step(optimizer)
+         scaler.update()
+
+         dice = compute_dice_score(logits.detach(), labels)
+
+         running_loss += loss.item()
+         running_dice += dice
+
+         avg_loss = running_loss / (pbar.n + 1)
+         avg_dice = running_dice / (pbar.n + 1)
+
+         pbar.set_postfix(
+             loss=f"{avg_loss:.4f}",
+             dice=f"{avg_dice:.4f}",
+         )
+
+     return running_loss / len(loader), running_dice / len(loader)
+
+
+ @torch.no_grad()
+ def validate(model, loader, criterion, device, use_amp=True):
+     model.eval()
+
+     running_loss = 0.0
+     running_dice = 0.0
+
+     pbar = tqdm(loader, desc="Val", leave=False)
+
+     for batch in pbar:
+         images = batch["image"].to(device)
+         labels = batch["label"].to(device)
+
+         with torch.amp.autocast("cuda", enabled=use_amp and device.type == "cuda"):
+             logits = model(images)
+             loss = criterion(logits, labels)
+
+         dice = compute_dice_score(logits, labels)
+
+         running_loss += loss.item()
+         running_dice += dice
+
+         avg_loss = running_loss / (pbar.n + 1)
+         avg_dice = running_dice / (pbar.n + 1)
+
+         pbar.set_postfix(
+             loss=f"{avg_loss:.4f}",
+             dice=f"{avg_dice:.4f}",
+         )
+
+     return running_loss / len(loader), running_dice / len(loader)
+
+
+ def save_checkpoint(path, model, optimizer, epoch, best_dice, args):
+     path = Path(path)
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     torch.save(
+         {
+             "epoch": epoch,
+             "model_state_dict": model.state_dict(),
+             "optimizer_state_dict": optimizer.state_dict(),
+             "best_dice": best_dice,
+             "args": vars(args),
+         },
+         path,
+     )
+
+
+ def main(args):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     train_dataset = FIVESDataset(
+         root=args.data_root,
+         split="train",
+         transform=get_train_transforms(image_size=args.image_size),
+     )
+
+     val_dataset = FIVESDataset(
+         root=args.data_root,
+         split="test",
+         transform=get_val_transforms(image_size=args.image_size),
+     )
+
+     train_loader = DataLoader(
+         train_dataset,
+         batch_size=args.batch_size,
+         shuffle=True,
+         num_workers=args.num_workers,
+         pin_memory=True,
+     )
+
+     val_loader = DataLoader(
+         val_dataset,
+         batch_size=args.batch_size,
+         shuffle=False,
+         num_workers=args.num_workers,
+         pin_memory=True,
+     )
+
+     model = build_model(
+         model_name=args.model,
+         num_classes=1,
+         in_channels=3,
+         image_size=args.image_size,
+         backbone=args.backbone,
+         pretrained=not args.no_pretrained,
+         base_channels=args.base_channels,
+         dropout=args.dropout,
+     ).to(device)
+
+     criterion = BCEDiceLoss(
+         bce_weight=args.bce_weight,
+         dice_weight=args.dice_weight,
+     )
+
+     optimizer = torch.optim.AdamW(
+         model.parameters(),
+         lr=args.lr,
+         weight_decay=args.weight_decay,
+     )
+
+     scaler = torch.amp.GradScaler(enabled=args.amp and device.type == "cuda")
+
+     best_dice = -1.0
+
+     print(f"Device: {device}")
+     print(f"Train samples: {len(train_dataset)}")
+     print(f"Val samples: {len(val_dataset)}")
+     print(f"Image size: {args.image_size}")
+     print(f"Batch size: {args.batch_size}")
+     print(f"Pretrained: {not args.no_pretrained}")
+
+     for epoch in range(1, args.epochs + 1):
+         print(f"\nEpoch [{epoch:03d}/{args.epochs}]")
+
+         train_loss, train_dice = train_one_epoch(
+             model=model,
+             loader=train_loader,
+             optimizer=optimizer,
+             scaler=scaler,
+             criterion=criterion,
+             device=device,
+             use_amp=args.amp,
+         )
+
+         val_loss, val_dice = validate(
+             model=model,
+             loader=val_loader,
+             criterion=criterion,
+             device=device,
+             use_amp=args.amp,
+         )
+
+         print(
+             f"train_loss={train_loss:.4f} "
+             f"train_dice={train_dice:.4f} "
+             f"val_loss={val_loss:.4f} "
+             f"val_dice={val_dice:.4f}"
+         )
+
+         if val_dice > best_dice:
+             best_dice = val_dice
+             save_checkpoint(
+                 Path(args.output_dir) / "best.pt",
+                 model,
+                 optimizer,
+                 epoch,
+                 best_dice,
+                 args,
+             )
+             print(f"Saved best checkpoint: val_dice={best_dice:.4f}")
+
+         if epoch % args.save_every == 0:
+             save_checkpoint(
+                 Path(args.output_dir) / f"epoch_{epoch:03d}.pt",
+                 model,
+                 optimizer,
+                 epoch,
+                 best_dice,
+                 args,
+             )
+
+     save_checkpoint(
+         Path(args.output_dir) / "last.pt",
+         model,
+         optimizer,
+         args.epochs,
+         best_dice,
+         args,
+     )
+
+     print("Training complete.")
+     print(f"Best val Dice: {best_dice:.4f}")
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Train retinal vessel segmentation model on FIVES.")
+
+     parser.add_argument("--data-root", type=str, required=True)
+     parser.add_argument("--output-dir", type=str, default="checkpoints/fives")
+     parser.add_argument("--image-size", type=int, default=512)
+     parser.add_argument("--epochs", type=int, default=100)
+     parser.add_argument("--batch-size", type=int, default=4)
+     parser.add_argument("--num-workers", type=int, default=4)
+
+     parser.add_argument("--model", type=str, default="resunet", choices=["resunet", "deeplabv3", "vit"])
+     parser.add_argument("--backbone", type=str, default="resnet50")
+     parser.add_argument("--base-channels", type=int, default=32)
+     parser.add_argument("--dropout", type=float, default=0.0)
+     parser.add_argument("--no-pretrained", action="store_true")
+
+     parser.add_argument("--lr", type=float, default=1e-4)
+     parser.add_argument("--weight-decay", type=float, default=1e-4)
+     parser.add_argument("--bce-weight", type=float, default=1.0)
+     parser.add_argument("--dice-weight", type=float, default=1.0)
+     parser.add_argument("--save-every", type=int, default=25)
+     parser.add_argument("--amp", action="store_true")
+
+     return parser.parse_args()
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+     main(args)
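
As a usage sketch, a run that writes checkpoints where app.py's defaults expect them might look like the following (the FIVES root path is a placeholder; all flags exist in parse_args above):

    python train.py --data-root /path/to/FIVES_dataset --output-dir checkpoints/fives_resunet --model resunet --image-size 1024 --batch-size 4 --epochs 100 --amp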