Upload MALUNet CVC-ClinicDB weights

Browse files

Files changed (5) hide show

README.md +71 -0
best.pth +3 -0
infer.py +159 -0
models/__init__.py +0 -0
models/malunet.py +317 -0

README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+---
+license: apache-2.0
+tags:
+  - image-segmentation
+  - medical-imaging
+  - polyp-segmentation
+  - pytorch
+  - malunet
+datasets:
+  - cvc-clinicdb
+library_name: pytorch
+pipeline_tag: image-segmentation
+---
+# MALUNet · CVC-ClinicDB (Polyp Segmentation)
+Lightweight U-shaped segmentation network adapted from
+[jcruan519/MALUNet](https://github.com/jcruan519/MALUNet) and trained on
+[CVC-ClinicDB](https://www.kaggle.com/datasets/balraj98/cvcclinicdb) for
+binary polyp segmentation in colonoscopy frames.
+## Model
+- Architecture: MALUNet (DGA + IEA + CAB + SAB)
+- Channels: `[8, 16, 24, 32, 48, 64]`, `split_att="fc"`, `bridge=True`
+- Input: RGB, 256×256
+- Output: single-channel sigmoid mask (1 = polyp)
+- Parameters: ~0.18 M
+## Training
+- Dataset: CVC-ClinicDB (612 paired image/mask frames)
+- Split: 80% train / 20% val (seeded by filename, `seed=42`)
+- Loss: BCE + Dice
+- Optimizer: AdamW, `lr=1e-3`, `weight_decay=1e-2`
+- Schedule: CosineAnnealingLR, `T_max=50`, `eta_min=1e-5`
+- Augmentations: random h/v flip, random rotation
+- Epochs: 150
+## Usage
+```python
+import torch
+from huggingface_hub import hf_hub_download
+from infer import load_model, predict_mask  # infer.py from this repo
+from PIL import Image
+model = load_model("YOUR_USERNAME/malunet-cvc")
+mask = predict_mask(model, Image.open("polyp.png"))
+Image.fromarray(mask).save("mask.png")
+```
+`infer.py` and `models/malunet.py` are bundled in this repo so you can
+also clone it and run inference without the original training code.
+## Limitations
+- Trained on CVC-ClinicDB only (612 frames, single source). Generalization
+  to other colonoscopy systems / patient populations is unverified.
+- Not a medical device. Research / demo use only.
+## Citation
+```bibtex
+@inproceedings{ruan2023malunet,
+  title={MALUNet: A multi-attention and light-weight UNet for skin lesion segmentation},
+  author={Ruan, Jiacheng and Xiang, Suncheng and Xie, Mingye and Liu, Ting and Fu, Yuzhuo},
+  booktitle={2022 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)},
+  year={2022}
+}
+```

best.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5966e588253cb8c8d4119c10a40fb4ebc60c3cf87fe4d04f4409d03fd271848a
+size 790195

infer.py ADDED Viewed

	@@ -0,0 +1,159 @@

+"""Standalone inference helpers for MALUNet on CVC-ClinicDB.
+`load_model` accepts either a local checkpoint path or an "<owner>/<repo>"
+reference to a Hugging Face model repository (it downloads `best.pth`).
+CLI:
+  python infer.py --weights ./best.pth --image polyp.png --out mask.png
+  python infer.py --weights jane-l/malunet-cvc --image polyp.png --out mask.png
+"""
+import argparse
+import io
+import os
+from pathlib import Path
+from typing import Tuple, Union
+import numpy as np
+import torch
+from PIL import Image
+from models.malunet import MALUNet
+# Keyword arguments passed to the MALUNet constructor by `_build`.
+DEFAULT_MODEL_CONFIG = {
+    "num_classes": 1,
+    "input_channels": 3,
+    "c_list": [8, 16, 24, 32, 48, 64],
+    "split_att": "fc",
+    "bridge": True,
+}
+# Side length, in pixels, of the square image `_preprocess` resizes to.
+INPUT_SIZE = 256
+# Scalar mean / std that `_preprocess` applies to raw pixel values before its
+# min-max rescale to [0, 255].
+NORM_MEAN = 109.0
+NORM_STD = 75.0
+def _build():
+    return MALUNet(
+        num_classes=DEFAULT_MODEL_CONFIG["num_classes"],
+        input_channels=DEFAULT_MODEL_CONFIG["input_channels"],
+        c_list=DEFAULT_MODEL_CONFIG["c_list"],
+        split_att=DEFAULT_MODEL_CONFIG["split_att"],
+        bridge=DEFAULT_MODEL_CONFIG["bridge"],
+    )
+def _is_hf_repo_id(s: str) -> bool:
+    if os.path.exists(s):
+        return False
+    return "/" in s and not s.endswith(".pth") and not s.endswith(".pt")
+def _strip_module_prefix(state_dict):
+    return {k[7:] if k.startswith("module.") else k: v for k, v in state_dict.items()}
+def load_model(weights: str, device: Union[str, torch.device, None] = None) -> torch.nn.Module:
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    elif isinstance(device, str):
+        device = torch.device(device)
+    if _is_hf_repo_id(weights):
+        from huggingface_hub import hf_hub_download
+        weights = hf_hub_download(repo_id=weights, filename="best.pth")
+    state = torch.load(weights, map_location="cpu")
+    if isinstance(state, dict) and "model_state_dict" in state:
+        state = state["model_state_dict"]
+    state = _strip_module_prefix(state)
+    model = _build()
+    model.load_state_dict(state, strict=True)
+    model.to(device).eval()
+    return model
+def _preprocess(img: Image.Image) -> Tuple[torch.Tensor, Tuple[int, int]]:
+    """RGB PIL image -> normalized (1,3,H,W) tensor. Returns the original (H,W)."""
+    img = img.convert("RGB")
+    orig_size = img.size[::-1]  # (H, W)
+    arr = np.asarray(img, dtype=np.float32)
+    arr = (arr - NORM_MEAN) / NORM_STD
+    lo, hi = arr.min(), arr.max()
+    if hi > lo:
+        arr = (arr - lo) / (hi - lo) * 255.0
+    else:
+        arr = np.zeros_like(arr)
+    img_resized = Image.fromarray(arr.astype(np.uint8)).resize(
+        (INPUT_SIZE, INPUT_SIZE), Image.BILINEAR
+    )
+    t = torch.from_numpy(np.asarray(img_resized, dtype=np.float32)).permute(2, 0, 1).unsqueeze(0)
+    return t, orig_size
+@torch.no_grad()
+def predict_mask(
+    model: torch.nn.Module,
+    image: Union[str, Path, Image.Image, bytes],
+    threshold: float = 0.5,
+    return_prob: bool = False,
+) -> np.ndarray:
+    """Returns a uint8 mask resized back to the original image resolution."""
+    if isinstance(image, (str, Path)):
+        img = Image.open(image)
+    elif isinstance(image, bytes):
+        img = Image.open(io.BytesIO(image))
+    elif isinstance(image, Image.Image):
+        img = image
+    else:
+        raise TypeError(f"unsupported image type: {type(image)}")
+    device = next(model.parameters()).device
+    t, (h, w) = _preprocess(img)
+    t = t.to(device).float()
+    out = model(t)  # (1,1,256,256), already sigmoid
+    prob = out[0, 0].cpu().numpy()
+    prob_full = np.array(
+        Image.fromarray((prob * 255).astype(np.uint8)).resize((w, h), Image.BILINEAR),
+        dtype=np.float32,
+    ) / 255.0
+    if return_prob:
+        return prob_full
+    return (prob_full >= threshold).astype(np.uint8) * 255
+def overlay(image: Image.Image, mask: np.ndarray, alpha: float = 0.45) -> Image.Image:
+    base = image.convert("RGB")
+    bw, bh = base.size
+    if mask.shape != (bh, bw):
+        mask = np.array(Image.fromarray(mask).resize((bw, bh), Image.NEAREST))
+    color = np.zeros((bh, bw, 3), dtype=np.uint8)
+    color[..., 0] = mask  # red
+    base_arr = np.asarray(base, dtype=np.float32)
+    mask_bool = mask > 0
+    blended = base_arr.copy()
+    blended[mask_bool] = (1 - alpha) * base_arr[mask_bool] + alpha * color[mask_bool]
+    return Image.fromarray(blended.astype(np.uint8))
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--weights", required=True, help="Local .pth path OR <owner>/<repo> on HF")
+    ap.add_argument("--image", required=True)
+    ap.add_argument("--out", default="mask.png")
+    ap.add_argument("--overlay-out", default=None, help="optional overlay PNG path")
+    ap.add_argument("--threshold", type=float, default=0.5)
+    args = ap.parse_args()
+    model = load_model(args.weights)
+    img = Image.open(args.image)
+    mask = predict_mask(model, img, threshold=args.threshold)
+    Image.fromarray(mask).save(args.out)
+    print(f"wrote {args.out}")
+    if args.overlay_out:
+        overlay(img, mask).save(args.overlay_out)
+        print(f"wrote {args.overlay_out}")
+if __name__ == "__main__":
+    main()

models/__init__.py ADDED Viewed

File without changes

models/malunet.py ADDED Viewed

	@@ -0,0 +1,317 @@

+import torch
+from torch import nn
+import torch.nn.functional as F
+from timm.models.layers import trunc_normal_
+import math
+class DepthWiseConv2d(nn.Module):
+    def __init__(self, dim_in, dim_out, kernel_size=3, padding=1, stride=1, dilation=1):
+        super().__init__()
+        self.conv1 = nn.Conv2d(dim_in, dim_in, kernel_size=kernel_size, padding=padding,
+                      stride=stride, dilation=dilation, groups=dim_in)
+        self.norm_layer = nn.GroupNorm(4, dim_in)
+        self.conv2 = nn.Conv2d(dim_in, dim_out, kernel_size=1)
+    def forward(self, x):
+        return self.conv2(self.norm_layer(self.conv1(x)))
+class GatedAttentionUnit(nn.Module):
+    def __init__(self, in_c, out_c, kernel_size):
+        super().__init__()
+        self.w1 = nn.Sequential(
+            DepthWiseConv2d(in_c, in_c, kernel_size, padding=kernel_size//2),
+            nn.Sigmoid()
+        )
+        self.w2 = nn.Sequential(
+            DepthWiseConv2d(in_c, in_c, kernel_size + 2, padding=(kernel_size + 2)//2),
+            nn.GELU()
+        )
+        self.wo = nn.Sequential(
+            DepthWiseConv2d(in_c, out_c, kernel_size),
+            nn.GELU()
+        )
+        self.cw = nn.Conv2d(in_c, out_c, 1)
+    def forward(self, x):
+        x1, x2 = self.w1(x), self.w2(x)
+        out = self.wo(x1 * x2) + self.cw(x)
+        return out
+class DilatedGatedAttention(nn.Module):
+    def __init__(self, in_c, out_c, k_size=3, dilated_ratio=[7, 5, 2, 1]):
+        super().__init__()
+        self.mda0 = nn.Conv2d(in_c//4, in_c//4, kernel_size=k_size, stride=1,
+                              padding=(k_size+(k_size-1)*(dilated_ratio[0]-1))//2,
+                             dilation=dilated_ratio[0], groups=in_c//4)
+        self.mda1 = nn.Conv2d(in_c//4, in_c//4, kernel_size=k_size, stride=1,
+                              padding=(k_size+(k_size-1)*(dilated_ratio[1]-1))//2,
+                             dilation=dilated_ratio[1], groups=in_c//4)
+        self.mda2 = nn.Conv2d(in_c//4, in_c//4, kernel_size=k_size, stride=1,
+                              padding=(k_size+(k_size-1)*(dilated_ratio[2]-1))//2,
+                             dilation=dilated_ratio[2], groups=in_c//4)
+        self.mda3 = nn.Conv2d(in_c//4, in_c//4, kernel_size=k_size, stride=1,
+                              padding=(k_size+(k_size-1)*(dilated_ratio[3]-1))//2,
+                             dilation=dilated_ratio[3], groups=in_c//4)
+        self.norm_layer = nn.GroupNorm(4, in_c)
+        self.conv = nn.Conv2d(in_c, in_c, 1)
+        self.gau = GatedAttentionUnit(in_c, out_c, 3)
+    def forward(self, x):
+        x = torch.chunk(x, 4, dim=1)
+        x0 = self.mda0(x[0])
+        x1 = self.mda1(x[1])
+        x2 = self.mda2(x[2])
+        x3 = self.mda3(x[3])
+        x = F.gelu(self.conv(self.norm_layer(torch.cat((x0, x1, x2, x3), dim=1))))
+        x = self.gau(x)
+        return x
+class EAblock(nn.Module):
+    def __init__(self, in_c):
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_c, in_c, 1)
+        self.k = in_c * 4
+        self.linear_0 = nn.Conv1d(in_c, self.k, 1, bias=False)
+        self.linear_1 = nn.Conv1d(self.k, in_c, 1, bias=False)
+        self.linear_1.weight.data = self.linear_0.weight.data.permute(1, 0, 2)
+        self.conv2 = nn.Conv2d(in_c, in_c, 1, bias=False)
+        self.norm_layer = nn.GroupNorm(4, in_c)
+    def forward(self, x):
+        idn = x
+        x = self.conv1(x)
+        b, c, h, w = x.size()
+        x = x.view(b, c, h*w)   # b * c * n
+        attn = self.linear_0(x) # b, k, n
+        attn = F.softmax(attn, dim=-1) # b, k, n
+        attn = attn / (1e-9 + attn.sum(dim=1, keepdim=True)) #  # b, k, n
+        x = self.linear_1(attn) # b, c, n
+        x = x.view(b, c, h, w)
+        x = self.norm_layer(self.conv2(x))
+        x = x + idn
+        x = F.gelu(x)
+        return x
+class Channel_Att_Bridge(nn.Module):
+    def __init__(self, c_list, split_att='fc'):
+        super().__init__()
+        c_list_sum = sum(c_list) - c_list[-1]
+        self.split_att = split_att
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.get_all_att = nn.Conv1d(1, 1, kernel_size=3, padding=1, bias=False)
+        self.att1 = nn.Linear(c_list_sum, c_list[0]) if split_att == 'fc' else nn.Conv1d(c_list_sum, c_list[0], 1)
+        self.att2 = nn.Linear(c_list_sum, c_list[1]) if split_att == 'fc' else nn.Conv1d(c_list_sum, c_list[1], 1)
+        self.att3 = nn.Linear(c_list_sum, c_list[2]) if split_att == 'fc' else nn.Conv1d(c_list_sum, c_list[2], 1)
+        self.att4 = nn.Linear(c_list_sum, c_list[3]) if split_att == 'fc' else nn.Conv1d(c_list_sum, c_list[3], 1)
+        self.att5 = nn.Linear(c_list_sum, c_list[4]) if split_att == 'fc' else nn.Conv1d(c_list_sum, c_list[4], 1)
+        self.sigmoid = nn.Sigmoid()
+    def forward(self, t1, t2, t3, t4, t5):
+        att = torch.cat((self.avgpool(t1),
+                         self.avgpool(t2),
+                         self.avgpool(t3),
+                         self.avgpool(t4),
+                         self.avgpool(t5)), dim=1)
+        att = self.get_all_att(att.squeeze(-1).transpose(-1, -2))
+        if self.split_att != 'fc':
+            att = att.transpose(-1, -2)
+        att1 = self.sigmoid(self.att1(att))
+        att2 = self.sigmoid(self.att2(att))
+        att3 = self.sigmoid(self.att3(att))
+        att4 = self.sigmoid(self.att4(att))
+        att5 = self.sigmoid(self.att5(att))
+        if self.split_att == 'fc':
+            att1 = att1.transpose(-1, -2).unsqueeze(-1).expand_as(t1)
+            att2 = att2.transpose(-1, -2).unsqueeze(-1).expand_as(t2)
+            att3 = att3.transpose(-1, -2).unsqueeze(-1).expand_as(t3)
+            att4 = att4.transpose(-1, -2).unsqueeze(-1).expand_as(t4)
+            att5 = att5.transpose(-1, -2).unsqueeze(-1).expand_as(t5)
+        else:
+            att1 = att1.unsqueeze(-1).expand_as(t1)
+            att2 = att2.unsqueeze(-1).expand_as(t2)
+            att3 = att3.unsqueeze(-1).expand_as(t3)
+            att4 = att4.unsqueeze(-1).expand_as(t4)
+            att5 = att5.unsqueeze(-1).expand_as(t5)
+        return att1, att2, att3, att4, att5
+class Spatial_Att_Bridge(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.shared_conv2d = nn.Sequential(nn.Conv2d(2, 1, 7, stride=1, padding=9, dilation=3),
+                                          nn.Sigmoid())
+    def forward(self, t1, t2, t3, t4, t5):
+        t_list = [t1, t2, t3, t4, t5]
+        att_list = []
+        for t in t_list:
+            avg_out = torch.mean(t, dim=1, keepdim=True)
+            max_out, _ = torch.max(t, dim=1, keepdim=True)
+            att = torch.cat([avg_out, max_out], dim=1)
+            att = self.shared_conv2d(att)
+            att_list.append(att)
+        return att_list[0], att_list[1], att_list[2], att_list[3], att_list[4]
+class SC_Att_Bridge(nn.Module):
+    def __init__(self, c_list, split_att='fc'):
+        super().__init__()
+        self.catt = Channel_Att_Bridge(c_list, split_att=split_att)
+        self.satt = Spatial_Att_Bridge()
+    def forward(self, t1, t2, t3, t4, t5):
+        r1, r2, r3, r4, r5 = t1, t2, t3, t4, t5
+        satt1, satt2, satt3, satt4, satt5 = self.satt(t1, t2, t3, t4, t5)
+        t1, t2, t3, t4, t5 = satt1 * t1, satt2 * t2, satt3 * t3, satt4 * t4, satt5 * t5
+        r1_, r2_, r3_, r4_, r5_ = t1, t2, t3, t4, t5
+        t1, t2, t3, t4, t5 = t1 + r1, t2 + r2, t3 + r3, t4 + r4, t5 + r5
+        catt1, catt2, catt3, catt4, catt5 = self.catt(t1, t2, t3, t4, t5)
+        t1, t2, t3, t4, t5 = catt1 * t1, catt2 * t2, catt3 * t3, catt4 * t4, catt5 * t5
+        return t1 + r1_, t2 + r2_, t3 + r3_, t4 + r4_, t5 + r5_
+class MALUNet(nn.Module):
+    """Encoder-decoder segmentation network with optional attention bridge on the skips.
+
+    Args:
+        num_classes: Number of output channels of the final 1x1 convolution.
+        input_channels: Number of channels of the input image.
+        c_list: Six channel widths, one per encoder stage.
+        split_att: Head type forwarded to SC_Att_Bridge ('fc' or anything else).
+        bridge: When True, the five skip tensors pass through SC_Att_Bridge.
+    """
+    def __init__(self, num_classes=1, input_channels=3, c_list=[8,16,24,32,48,64],
+                split_att='fc', bridge=True):
+        super().__init__()
+        self.bridge = bridge
+        # Encoder stages 1-3 are plain 3x3 convolutions.
+        self.encoder1 = nn.Sequential(
+            nn.Conv2d(input_channels, c_list[0], 3, stride=1, padding=1),
+        )
+        self.encoder2 =nn.Sequential(
+            nn.Conv2d(c_list[0], c_list[1], 3, stride=1, padding=1),
+        )
+        self.encoder3 = nn.Sequential(
+            nn.Conv2d(c_list[1], c_list[2], 3, stride=1, padding=1),
+        )
+        # Encoder stages 4-6 pair an EAblock with a DilatedGatedAttention block.
+        self.encoder4 = nn.Sequential(
+            EAblock(c_list[2]),
+            DilatedGatedAttention(c_list[2], c_list[3]),
+        )
+        self.encoder5 = nn.Sequential(
+            EAblock(c_list[3]),
+            DilatedGatedAttention(c_list[3], c_list[4]),
+        )
+        self.encoder6 = nn.Sequential(
+            EAblock(c_list[4]),
+            DilatedGatedAttention(c_list[4], c_list[5]),
+        )
+        if bridge:
+            self.scab = SC_Att_Bridge(c_list, split_att)
+            print('SC_Att_Bridge was used')
+        # Decoder stages mirror the encoder, with the two block types in reverse order.
+        self.decoder1 = nn.Sequential(
+            DilatedGatedAttention(c_list[5], c_list[4]),
+            EAblock(c_list[4]),
+        )
+        self.decoder2 = nn.Sequential(
+            DilatedGatedAttention(c_list[4], c_list[3]),
+            EAblock(c_list[3]),
+        )
+        self.decoder3 = nn.Sequential(
+            DilatedGatedAttention(c_list[3], c_list[2]),
+            EAblock(c_list[2]),
+        )
+        self.decoder4 = nn.Sequential(
+            nn.Conv2d(c_list[2], c_list[1], 3, stride=1, padding=1),
+        )
+        self.decoder5 = nn.Sequential(
+            nn.Conv2d(c_list[1], c_list[0], 3, stride=1, padding=1),
+        )
+        # GroupNorm layers (4 groups) applied after each encoder / decoder stage.
+        self.ebn1 = nn.GroupNorm(4, c_list[0])
+        self.ebn2 = nn.GroupNorm(4, c_list[1])
+        self.ebn3 = nn.GroupNorm(4, c_list[2])
+        self.ebn4 = nn.GroupNorm(4, c_list[3])
+        self.ebn5 = nn.GroupNorm(4, c_list[4])
+        self.dbn1 = nn.GroupNorm(4, c_list[4])
+        self.dbn2 = nn.GroupNorm(4, c_list[3])
+        self.dbn3 = nn.GroupNorm(4, c_list[2])
+        self.dbn4 = nn.GroupNorm(4, c_list[1])
+        self.dbn5 = nn.GroupNorm(4, c_list[0])
+        self.final = nn.Conv2d(c_list[0], num_classes, kernel_size=1)
+        # Re-initialise every submodule with the scheme in _init_weights.
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        """Initialise Linear, Conv1d and Conv2d modules; other module types are left as they are."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Conv1d):
+                # Normal init with std sqrt(2 / (kernel_size * out_channels)); bias is not touched.
+                n = m.kernel_size[0] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+        elif isinstance(m, nn.Conv2d):
+            # Normal init with std sqrt(2 / fan_out), fan_out divided by the group count.
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+    def forward(self, x):
+        """Return the sigmoid of the final convolution, upsampled by 2 to the input scale.
+
+        Each of the five encoder stages halves the resolution with a 2x2 max pool, and the
+        decoder doubles it back with bilinear interpolation while adding the skip tensors.
+        NOTE(review): presumably H and W must be multiples of 32 for the skip additions
+        to line up - confirm against callers.
+        """
+        out = F.gelu(F.max_pool2d(self.ebn1(self.encoder1(x)),2,2))
+        t1 = out # b, c0, H/2, W/2
+        out = F.gelu(F.max_pool2d(self.ebn2(self.encoder2(out)),2,2))
+        t2 = out # b, c1, H/4, W/4
+        out = F.gelu(F.max_pool2d(self.ebn3(self.encoder3(out)),2,2))
+        t3 = out # b, c2, H/8, W/8
+        out = F.gelu(F.max_pool2d(self.ebn4(self.encoder4(out)),2,2))
+        t4 = out # b, c3, H/16, W/16
+        out = F.gelu(F.max_pool2d(self.ebn5(self.encoder5(out)),2,2))
+        t5 = out # b, c4, H/32, W/32
+        # The bridge only rewrites the skip tensors; `out` continues unchanged.
+        if self.bridge: t1, t2, t3, t4, t5 = self.scab(t1, t2, t3, t4, t5)
+        out = F.gelu(self.encoder6(out)) # b, c5, H/32, W/32
+        out5 = F.gelu(self.dbn1(self.decoder1(out))) # b, c4, H/32, W/32
+        out5 = torch.add(out5, t5) # b, c4, H/32, W/32
+        out4 = F.gelu(F.interpolate(self.dbn2(self.decoder2(out5)),scale_factor=(2,2),mode ='bilinear',align_corners=True)) # b, c3, H/16, W/16
+        out4 = torch.add(out4, t4) # b, c3, H/16, W/16
+        out3 = F.gelu(F.interpolate(self.dbn3(self.decoder3(out4)),scale_factor=(2,2),mode ='bilinear',align_corners=True)) # b, c2, H/8, W/8
+        out3 = torch.add(out3, t3) # b, c2, H/8, W/8
+        out2 = F.gelu(F.interpolate(self.dbn4(self.decoder4(out3)),scale_factor=(2,2),mode ='bilinear',align_corners=True)) # b, c1, H/4, W/4
+        out2 = torch.add(out2, t2) # b, c1, H/4, W/4
+        out1 = F.gelu(F.interpolate(self.dbn5(self.decoder5(out2)),scale_factor=(2,2),mode ='bilinear',align_corners=True)) # b, c0, H/2, W/2
+        out1 = torch.add(out1, t1) # b, c0, H/2, W/2
+        out0 = F.interpolate(self.final(out1),scale_factor=(2,2),mode ='bilinear',align_corners=True) # b, num_class, H, W
+        return torch.sigmoid(out0)