ReaganWZY committed 1 day ago

Commit

5acc7ae

verified ·

1 Parent(s): b7b2f5c

Upload DepthPolyp model artifacts

Browse files

Files changed (27) hide show

.gitattributes +6 -34
DepthPolyp_Kvasir.onnx +3 -0
DepthPolyp_Kvasir.pth +3 -0
LICENSE +21 -0
README.md +168 -0
assets/depthpolyp_architecture.png +3 -0
assets/seq19.gif +3 -0
assets/seq22.gif +3 -0
config.json +20 -0
model/depthpolyp.py +220 -0
model/modules/DGG.py +20 -0
model/modules/GFM.py +28 -0
model/modules/HF_Decoder.py +114 -0
model/modules/ISF.py +78 -0
model/modules/MiT_Encoder.py +517 -0
model/modules/Seg_Head.py +63 -0
requirements.txt +6 -0
samples/kvasir/images/sample_01.jpg +0 -0
samples/kvasir/images/sample_02.jpg +3 -0
samples/kvasir/outputs/depth/sample_01.png +0 -0
samples/kvasir/outputs/depth/sample_02.png +3 -0
samples/kvasir/outputs/masks/sample_01.png +0 -0
samples/kvasir/outputs/masks/sample_02.png +0 -0
samples/kvasir/outputs/overlay/sample_01.jpg +0 -0
samples/kvasir/outputs/overlay/sample_02.jpg +3 -0
scripts/export_onnx.py +64 -0
scripts/infer_onnx.py +112 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,7 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pth filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text

DepthPolyp_Kvasir.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:883ff8a825a5f51f59d46a9a2c2e9f0a505519140495dfa6800e6b48297c9f5b
+size 14588196

DepthPolyp_Kvasir.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2bfad787ccc259b25bb28ee77ec39c4ae4a579aba971facc3e579aa8debd6257
+size 14410152

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 ZHUOYU WU
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,168 @@

+---
+license: mit
+library_name: pytorch
+pipeline_tag: image-segmentation
+tags:
+- medical-image-segmentation
+- image-segmentation
+- semantic-segmentation
+- polyp-segmentation
+- colonoscopy
+- depth-estimation
+- pseudo-depth
+- real-time
+- onnx
+- pytorch
+- arxiv:2605.16519
+metrics:
+- dice
+- iou
+- recall
+---
+# DepthPolyp: Pseudo-Depth Guided Lightweight Segmentation for Real-Time Colonoscopy
+DepthPolyp is a lightweight pseudo-depth guided model for real-time colonoscopic polyp segmentation. Given an RGB colonoscopy frame, it jointly predicts:
+1. a binary polyp segmentation probability map
+2. a pseudo-depth map, sigmoid-normalized to [0, 1], for depth-aware structural guidance
+The model uses a MiT-B0 encoder and lightweight fusion/gating modules to keep deployment cost low while improving robustness under blur, illumination changes, reflections, and other real-world colonoscopy degradations.
+- Paper: [arXiv:2605.16519](https://arxiv.org/abs/2605.16519)
+- Code: [github.com/ReaganWu/DepthPolyp](https://github.com/ReaganWu/DepthPolyp)
+- License: MIT
+## Model Details
+| Item | Value |
+| --- | --- |
+| Model | DepthPolyp |
+| Encoder | MiT-B0 |
+| Input | RGB image, 224 x 224 |
+| Outputs | segmentation, pseudo-depth |
+| Parameters | 3.57M |
+| Complexity | 0.86 GMACs |
+| Training data | Kvasir-SEG with degradation-aware training |
+| PyTorch checkpoint | `DepthPolyp_Kvasir.pth` |
+| ONNX checkpoint | `DepthPolyp_Kvasir.onnx` |
+ONNX I/O names:
+```text
+input: image
+outputs: segmentation, depth
+```
+## Intended Use
+DepthPolyp is intended for research on colonoscopic polyp segmentation, lightweight medical image segmentation, robustness under endoscopic video degradation, and deployment-oriented model comparison.
+This model is not a standalone medical device and is not intended for clinical diagnosis without appropriate validation, regulatory review, and expert oversight.
+## Quick Start: ONNX Runtime
+```bash
+pip install onnxruntime pillow numpy
+python scripts/infer_onnx.py \
+  --onnx DepthPolyp_Kvasir.onnx \
+  --input samples/kvasir/images \
+  --output outputs
+```
+The script writes binary masks, pseudo-depth visualizations, and mask overlays.
+## Quick Start: PyTorch
+```bash
+pip install torch torchvision pillow numpy
+```
+```python
+import torch
+from PIL import Image
+from torchvision import transforms
+from model.depthpolyp import build_depthpolyp
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = build_depthpolyp(
+    encoder_name="b0",
+    in_channels=3,
+    num_classes=2,
+    decoder_channels=256,
+    activation=None,
+)
+state_dict = torch.load("DepthPolyp_Kvasir.pth", map_location="cpu", weights_only=True)
+model.load_state_dict(state_dict, strict=True)
+model.to(device).eval()
+image = Image.open("samples/kvasir/images/sample_01.jpg").convert("RGB")
+transform = transforms.Compose([
+    transforms.Resize((224, 224)),
+    transforms.ToTensor(),
+])
+x = transform(image).unsqueeze(0).to(device)
+with torch.no_grad():
+    seg_prob, depth_prob = model(x)
+print(seg_prob.shape)    # [1, 1, 224, 224]
+print(depth_prob.shape)  # [1, 1, 224, 224]
+```
+## Loading Files with `huggingface_hub`
+```python
+from huggingface_hub import hf_hub_download
+repo_id = "ReaganWZY/DepthPolyp"
+pth_path = hf_hub_download(repo_id=repo_id, filename="DepthPolyp_Kvasir.pth")
+onnx_path = hf_hub_download(repo_id=repo_id, filename="DepthPolyp_Kvasir.onnx")
+```
+If you are loading the files from a fork or mirror of this repository, replace `ReaganWZY/DepthPolyp` with that repository's id.
+## Evaluation
+Paper-reported reference results:
+| Protocol | Kvasir Dice/IoU/Recall | ClinicDB Dice/IoU/Recall | ColonDB Dice/IoU/Recall |
+| --- | --- | --- | --- |
+| `N->C` | 0.891 / 0.805 / 0.885 | 0.854 / 0.748 / 0.845 | 0.801 / 0.669 / 0.759 |
+| `N->N` | 0.853 / 0.745 / 0.854 | 0.751 / 0.608 / 0.759 | 0.734 / 0.582 / 0.697 |
+Real-world robustness and deployment results from the paper:
+| Params | GMACs | Avg. Dice | PolypGen Dice | iPhone FPS | Raspberry Pi 4 FPS |
+| ---: | ---: | ---: | ---: | ---: | ---: |
+| 3.57M | 0.86 | 0.779 | 0.679 | 181.54 | 4.05 |
+## Training Data and Protocol
+The released checkpoint is trained on Kvasir-SEG with degradation-aware training. Pseudo-depth targets are generated with Depth-Anything v2 Small and are used only during training; depth targets are not required at inference time.
+Reference training settings from the paper:
+- Input resolution: 224 x 224
+- Optimizer: AdamW
+- Learning rate: 1e-4
+- Weight decay: 1e-4
+- Batch size: 16
+- Epochs: 200
+- Schedule: 10% warm-up followed by cosine annealing
+## Citation
+```bibtex
+@misc{wu2026depthpolyp,
+  title={DepthPolyp: Pseudo-Depth Guided Lightweight Segmentation for Real-Time Colonoscopy},
+  author={Wu, Zhuoyu and Ou, Wenhui and Zhang, Lexi and Tan, Pei-Sze and Wu, Dongjun and Zhao, Junhe and Fang, Wenqi and Phan, Raphaël C.-W.},
+  year={2026},
+  eprint={2605.16519},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+```

assets/depthpolyp_architecture.png ADDED Viewed

Git LFS Details

SHA256: 164e1f204f551b849e0d30f9633840d899df5382a5686050ece67228763d10d6
Pointer size: 131 Bytes
Size of remote file: 683 kB

assets/seq19.gif ADDED Viewed

Git LFS Details

SHA256: b5bf1f06e43007f48de9c17f455f370be4b11a8e41aacda55f9f12e60146087a
Pointer size: 132 Bytes
Size of remote file: 4.04 MB

assets/seq22.gif ADDED Viewed

Git LFS Details

SHA256: af9cb8212c7a8a33e782d4c75678c85831309eb78743b7afa479bf991f7eebc3
Pointer size: 132 Bytes
Size of remote file: 3.4 MB

config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "architectures": [
+    "DepthPolyp"
+  ],
+  "model_type": "depthpolyp",
+  "encoder_name": "b0",
+  "in_channels": 3,
+  "num_classes": 2,
+  "decoder_channels": 256,
+  "activation": null,
+  "image_size": 224,
+  "outputs": [
+    "segmentation",
+    "depth"
+  ],
+  "training_dataset": "Kvasir-SEG",
+  "paper": "https://arxiv.org/abs/2605.16519",
+  "github": "https://github.com/ReaganWu/DepthPolyp"
+}

model/depthpolyp.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import torch
+import torch.nn as nn
+from .modules.HF_Decoder import HiF_Decoder
+from .modules.MiT_Encoder import MixVisionTransformer
+from .modules.Seg_Head import SegmentationHead
+class DepthPolyp(nn.Module):
+    def __init__(
+        self,
+        in_channels: int = 3,
+        num_classes: int = 2, # 1 for seg, 1 for depth
+        encoder_name: str = 'b0',
+        decoder_channels: int = 256,
+        activation: str = None,
+        upsampling: int = 4,
+    ):
+        super().__init__()
+        # Encoder configurations
+        encoder_configs = {
+            'b0': {
+                'embed_dims': [32, 64, 160, 256],
+                'num_heads': [1, 2, 5, 8],
+                'mlp_ratios': [4, 4, 4, 4],
+                'depths': [2, 2, 2, 2],
+                'sr_ratios': [8, 4, 2, 1],
+            },
+            'b1': {
+                'embed_dims': [64, 128, 320, 512],
+                'num_heads': [1, 2, 5, 8],
+                'mlp_ratios': [4, 4, 4, 4],
+                'depths': [2, 2, 2, 2],
+                'sr_ratios': [8, 4, 2, 1],
+            },
+            'b2': {
+                'embed_dims': [64, 128, 320, 512],
+                'num_heads': [1, 2, 5, 8],
+                'mlp_ratios': [4, 4, 4, 4],
+                'depths': [3, 4, 6, 3],
+                'sr_ratios': [8, 4, 2, 1],
+            },
+            'b3': {
+                'embed_dims': [64, 128, 320, 512],
+                'num_heads': [1, 2, 5, 8],
+                'mlp_ratios': [4, 4, 4, 4],
+                'depths': [3, 4, 18, 3],
+                'sr_ratios': [8, 4, 2, 1],
+            },
+            'b4': {
+                'embed_dims': [64, 128, 320, 512],
+                'num_heads': [1, 2, 5, 8],
+                'mlp_ratios': [4, 4, 4, 4],
+                'depths': [3, 8, 27, 3],
+                'sr_ratios': [8, 4, 2, 1],
+            },
+            'b5': {
+                'embed_dims': [64, 128, 320, 512],
+                'num_heads': [1, 2, 5, 8],
+                'mlp_ratios': [4, 4, 4, 4],
+                'depths': [3, 6, 40, 3],
+                'sr_ratios': [8, 4, 2, 1],
+            },
+        }
+        if encoder_name not in encoder_configs:
+            raise ValueError(f"encoder_name should be one of {list(encoder_configs.keys())}, got {encoder_name}")
+        config = encoder_configs[encoder_name]
+        # Build encoder
+        self.encoder = MixVisionTransformer(
+            in_chans=in_channels,
+            embed_dims=config['embed_dims'],
+            num_heads=config['num_heads'],
+            mlp_ratios=config['mlp_ratios'],
+            qkv_bias=True,
+            depths=config['depths'],
+            sr_ratios=config['sr_ratios'],
+            drop_rate=0.0,
+            drop_path_rate=0.1,
+        )
+        self.decoder = HiF_Decoder(
+            encoder_channels=config['embed_dims'],
+            decoder_channels=decoder_channels,
+        )
+        # Build segmentation head (nn.Sequential style)
+        self.segmentation_head = SegmentationHead(
+            in_channels=decoder_channels//4,
+            out_channels=num_classes,
+            activation=activation,
+            kernel_size=1,
+            upsampling=upsampling,
+        )
+        self.name = f"DepthPolyp-{encoder_name}"
+    def forward(self, x):
+        """Forward pass
+        Args:
+            x: Input tensor of shape (B, C, H, W)
+        Returns:
+            Output tensor of shape (B, num_classes, H, W)
+        """
+        # Encoder - returns features at [H/4, H/8, H/16, H/32]
+        encoder_features = self.encoder(x)
+        # Decoder - returns features at H/4
+        fpn_features = self.decoder(encoder_features)
+        decoder_output = fpn_features
+        # print(f"Decoder output shape: {decoder_output.shape}")
+        # Segmentation head - upsample to original size
+        masks = self.segmentation_head(decoder_output)
+        pred_seg = torch.sigmoid(masks[:, 0:1, :, :])   # segmentation 通道
+        pred_depth = torch.sigmoid(masks[:, 1:2, :, :])                # depth 通道，通常是回归，不做激活
+        return pred_seg, pred_depth
+    @torch.no_grad()
+    def predict(self, x):
+        """Inference method"""
+        if self.training:
+            self.eval()
+        return self(x)
+    def load_pretrained(self, checkpoint_path, strict=True):
+        """Load pretrained weights
+        Args:
+            checkpoint_path: Path to checkpoint file
+            strict: Whether to strictly enforce key matching
+        """
+        state_dict = torch.load(checkpoint_path, map_location='cpu')
+        # Handle different checkpoint formats
+        if 'state_dict' in state_dict:
+            state_dict = state_dict['state_dict']
+        elif 'model' in state_dict:
+            state_dict = state_dict['model']
+        # Remove module. prefix if present (from DataParallel)
+        new_state_dict = {}
+        for k, v in state_dict.items():
+            if k.startswith('module.'):
+                new_state_dict[k[7:]] = v
+            else:
+                new_state_dict[k] = v
+        self.load_state_dict(new_state_dict, strict=strict)
+        print(f"✓ Loaded pretrained weights from {checkpoint_path}")
+def build_depthpolyp(
+    encoder_name='b0',
+    in_channels=3,
+    num_classes=2,
+    decoder_channels=256,
+    activation=None,
+):
+    """
+    Create a DepthPolyp model
+    Args:
+        encoder_name: Encoder variant ('b0', 'b1', 'b2', 'b3', 'b4', 'b5')
+        in_channels: Number of input channels
+        num_classes: Number of output classes
+        decoder_channels: Number of channels in decoder
+        activation: Output activation ('sigmoid', 'softmax', or None)
+    Returns:
+        DepthPolyp model
+    Example:
+        >>> model = build_depthpolyp('b2', num_classes=21, activation='softmax')
+        >>> print(model)
+    """
+    model = DepthPolyp(
+        in_channels=in_channels,
+        num_classes=num_classes,
+        encoder_name=encoder_name,
+        decoder_channels=decoder_channels,
+        activation=activation,
+    )
+    return model
+if __name__ == '__main__':
+    print("="*60)
+    print("Loading Model .....")
+    model = build_depthpolyp(
+        encoder_name='b0',
+        in_channels=3, # Input channels
+        num_classes=2, # Total 2. 1 for seg, 1 for depth
+        decoder_channels=256,
+        activation='sigmoid',
+    )
+    print("="*60)
+    print("Validating Model .....")
+    print("Check the Param and Complexity(GMACs)")
+    import ptflops
+    macs, params = ptflops.get_model_complexity_info(
+        model, (3, 224, 224), as_strings=True,
+        print_per_layer_stat=False, verbose=False
+    )
+    print(f"   MACs: {macs}, Params: {params}")
+    # output is MACs: 862.17 MMac, Params: 3.57 M
+    print("="*60)
+    print("Check the output .....")
+    dummy_input = torch.randn(1, 3, 224, 224) # B, C, H, W, single RGB image
+    output_seg, output_depth = model(dummy_input)
+    print("input_shape is:", dummy_input.shape)
+    print("output_seg shape is:", output_seg.shape)
+    print("output_depth shape is:", output_depth.shape)

model/modules/DGG.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import torch
+import torch.nn as nn
+class DGG_Module(nn.Module):
+    def __init__(self, channels, groups):
+        super().__init__()
+        self.groups = groups
+        self.fc = nn.Linear(groups, groups)
+    def forward(self, x):
+        B, C, H, W = x.shape
+        gc = C // self.groups
+        xg = x.view(B, self.groups, gc, H, W).mean(dim=(2,3,4))  # (B, groups)
+        gates = torch.sigmoid(self.fc(xg))[:, :, None, None, None]  # (B, groups, 1, 1, 1)
+        xg = x.view(B, self.groups, gc, H, W)
+        out = (xg * gates).reshape(B, C, H, W)
+        return out

model/modules/GFM.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import torch.nn as nn
+class GFM_Module(nn.Module):
+    def __init__(self, in_channels, out_channels, ratio=2):
+        super().__init__()
+        init_channels = out_channels // ratio
+        new_channels = out_channels - init_channels
+        self.primary_conv = nn.Sequential(
+            nn.Conv2d(in_channels, init_channels, 1, bias=False),
+            nn.BatchNorm2d(init_channels),
+            nn.ReLU(inplace=True)
+        )
+        self.cheap_operation = nn.Sequential(
+            nn.Conv2d(init_channels, new_channels, 3, 1, 1, groups=init_channels, bias=False),
+            nn.BatchNorm2d(new_channels),
+            nn.ReLU(inplace=True)
+        )
+    def forward(self, x):
+        # print("input:", x.shape)
+        x1 = self.primary_conv(x)
+        # print("primary conv output:", x1.shape)
+        x2 = self.cheap_operation(x1)
+        # print("cheap operation output:", x2.shape)
+        return x1, x2

model/modules/HF_Decoder.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .GFM import GFM_Module
+from .DGG import DGG_Module
+from .ISF import ISF_Module
+class MLP(nn.Module):
+    """Simple MLP for decoder"""
+    def __init__(self, input_dim, embed_dim):
+        super().__init__()
+        self.proj = nn.Linear(input_dim, embed_dim)
+    def forward(self, x):
+        x = x.flatten(2).transpose(1, 2)
+        x = self.proj(x)
+        return x
+class HiF_Decoder(nn.Module):
+    """Hierarchical Factorized Decoder"""
+    def __init__(
+        self,
+        encoder_channels=[64, 128, 320, 512],
+        decoder_channels=256,
+    ):
+        super().__init__()
+        # MLP layers to unify channel dimensions
+        self.linear_c4 = MLP(input_dim=encoder_channels[3], embed_dim=decoder_channels)
+        self.linear_c3 = MLP(input_dim=encoder_channels[2], embed_dim=decoder_channels)
+        self.linear_c2 = MLP(input_dim=encoder_channels[1], embed_dim=decoder_channels)
+        self.linear_c1 = MLP(input_dim=encoder_channels[0], embed_dim=decoder_channels)
+        self.dropout = nn.Dropout2d(0.1)
+        self.gfm_c4_1 = GFM_Module(decoder_channels, decoder_channels//2)
+        self.gfm_c3_1 = GFM_Module(decoder_channels, decoder_channels//2)
+        self.gfm_c2_1 = GFM_Module(decoder_channels, decoder_channels//2)
+        self.gfm_c1_1 = GFM_Module(decoder_channels, decoder_channels//2)
+        self.gfm_c_o_1 = GFM_Module(decoder_channels, decoder_channels//2)
+        self.gfm_c_e_1 = GFM_Module(decoder_channels, decoder_channels//2)
+        self.gfm_c_o_2 = GFM_Module(decoder_channels//2, decoder_channels//4)
+        self.gfm_c_e_2 = GFM_Module(decoder_channels//2, decoder_channels//4)
+        self.gfm_c_o_3 = GFM_Module(decoder_channels//4, decoder_channels//8)
+        self.gfm_c_e_3 = GFM_Module(decoder_channels//4, decoder_channels//8)
+        self.cyclic_shuffle_enhancer_o = ISF_Module(channels=decoder_channels, groups=4, kernel_size=3, cyclic_percent=0.0)
+        self.cyclic_shuffle_enhancer_e = ISF_Module(channels=decoder_channels, groups=4, kernel_size=3, cyclic_percent=0.0)
+        self.gatefuser = DGG_Module(channels=decoder_channels//4, groups=4)
+    def forward(self, encoder_features):
+        # Encoder features: [c1, c2, c3, c4] with shapes [H/4, H/8, H/16, H/32]
+        c1, c2, c3, c4 = encoder_features
+        # Get target size (H/4, W/4) - same as c1
+        n, _, h, w = c1.shape
+        # Transform each feature and upsample to H/4
+        _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3])
+        _c4 = F.interpolate(_c4, size=(h, w), mode='bilinear', align_corners=False)
+        _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3])
+        _c3 = F.interpolate(_c3, size=(h, w), mode='bilinear', align_corners=False)
+        _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3])
+        _c2 = F.interpolate(_c2, size=(h, w), mode='bilinear', align_corners=False)
+        _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3])
+        # c1 is already at the target size, no need to interpolate
+        # Concatenate and fuse
+        # print(_c4.shape, _c3.shape, _c2.shape, _c1.shape)
+        # First Stage Ghost
+        # 4*256=1024 -> 8*64=512
+        _c4_g1_o, _c4_g2_e = self.gfm_c4_1(_c4)
+        _c3_g1_o, _c3_g2_e = self.gfm_c3_1(_c3)
+        _c2_g1_o, _c2_g2_e = self.gfm_c2_1(_c2)
+        _c1_g1_o, _c1_g2_e = self.gfm_c1_1(_c1)
+        # 2*4*64 -> 2*256=512 -> 4*64=256
+        _c_o_1 = torch.cat([_c4_g1_o, _c3_g1_o, _c2_g1_o, _c1_g1_o], dim=1) # B, 256, H, W
+        _c_e_1 = torch.cat([_c4_g2_e, _c3_g2_e, _c2_g2_e, _c1_g2_e], dim=1) # B, 256, H, W
+        _c_o_1_f = self.cyclic_shuffle_enhancer_o(_c_o_1) # fused _c_o_1 feature
+        _c_e_1_f = self.cyclic_shuffle_enhancer_e(_c_e_1) # fused _c_e_1 feature
+        _c_o_1_o, _c_o_1_e = self.gfm_c_o_1(_c_o_1_f)
+        _c_e_1_o, _c_e_1_e = self.gfm_c_e_1(_c_e_1_f)
+        # Second Stage Ghost
+        # 2*2*64=256 -> 2*128 -> 4*32=128
+        _c_o_2 = torch.cat([_c_o_1_o, _c_e_1_o], dim=1)   # (B, 128, H, W)
+        _c_e_2 = torch.cat([_c_o_1_e, _c_e_1_e], dim=1)   # (B, 128, H, W)
+        _c_o_2_o, _c_o_2_e = self.gfm_c_o_2(_c_o_2)    # (B, 32 H, W), (B, 32, H, W)
+        _c_e_2_o, _c_e_2_e = self.gfm_c_e_2(_c_e_2)    # (B, 32 H, W), (B, 32, H, W)
+        # Third Stage Ghost
+        # 2*2*32=128 -> 2*64 -> 4*16=64
+        _c_o_3 = torch.cat([_c_o_2_o, _c_e_2_o], dim=1)   # (B, 64, H, W)
+        _c_e_3 = torch.cat([_c_o_2_e, _c_e_2_e], dim=1)   # (B, 64, H, W)
+        _c_o_3_o, _c_o_3_e = self.gfm_c_o_3(_c_o_3)    # (B, 16 H, W), (B, 16, H, W)
+        _c_e_3_o, _c_e_3_e = self.gfm_c_e_3(_c_e_3)    # (B, 16 H, W), (B, 16, H, W)
+        x = torch.cat([_c_o_3_o, _c_e_3_o, _c_o_3_e, _c_e_3_e], dim=1) # (B, 64, H, W)
+        x_f = self.gatefuser(x)
+        x = x + x_f
+        x = self.dropout(x)
+        return x

model/modules/ISF.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import torch
+import torch.nn as nn
+class GroupChannelShuffle(nn.Module):
+    """
+    group-based channel shuffle / interleave.
+    groups: number of source groups you want to interleave (e.g. 4 for c1..c4)
+    optional cyclic shift (percent) to add deterministic rotation after shuffle.
+    """
+    def __init__(self, groups: int = 4, cyclic_percent: float = 0.0):
+        super().__init__()
+        assert groups >= 1
+        self.groups = groups
+        self.cyclic_percent = cyclic_percent
+    def forward(self, x):
+        # x: (B, C, H, W)
+        B, C, H, W = x.shape
+        g = self.groups
+        assert C % g == 0, f"channels {C} not divisible by groups {g}"
+        gc = C // g
+        # reshape to (B, groups, group_channels, H, W)
+        x = x.view(B, g, gc, H, W)
+        # transpose to interleave: (B, group_channels, groups, H, W)
+        x = x.transpose(1, 2).contiguous()
+        x = x.view(B, C, H, W)
+        # optional cyclic rotate by percent of channels (deterministic)
+        if self.cyclic_percent and 0 < self.cyclic_percent < 1.0:
+            shift = int(C * self.cyclic_percent)
+            x = torch.roll(x, shifts=shift, dims=1)
+        return x
+class ISF_Module(nn.Module):
+    """
+    A lightweight module that wraps shuffle + depthwise conv + group-wise scaling + residual.
+    - channels: total channels of x
+    - groups: number of logical groups (must divide channels)
+    """
+    def __init__(self, channels: int, groups: int = 4, kernel_size: int = 3, cyclic_percent: float = 0.0):
+        super().__init__()
+        assert channels % groups == 0
+        self.groups = groups
+        self.channels = channels
+        self.shuffle = GroupChannelShuffle(groups=groups, cyclic_percent=cyclic_percent)
+        # depthwise conv (per-channel local spatial enhancement)
+        self.dw = nn.Conv2d(channels, channels, kernel_size=kernel_size, padding=kernel_size//2, groups=channels, bias=False)
+        self.bn = nn.BatchNorm2d(channels)
+        self.act = nn.ReLU(inplace=True)
+        # group-wise scaling: one scalar per group to reweight groups after fusion
+        self.group_scale = nn.Parameter(torch.ones(groups), requires_grad=True)  # tiny param overhead
+        # optional small pointwise to re-calibrate channels (commented out to keep ultra-light)
+        # self.pw = nn.Conv2d(channels, channels, kernel_size=1, bias=False)
+    def forward(self, x):
+        # x: (B, C, H, W)
+        B, C, H, W = x.shape
+        # 1) deterministic interleave
+        y = self.shuffle(x)               # (B, C, H, W)
+        # 2) per-channel spatial refine
+        y = self.dw(y)
+        y = self.bn(y)
+        y = self.act(y)
+        # 3) group-wise scaling
+        gc = C // self.groups
+        # scale = self.group_scale.repeat_interleave(gc).view(1, C, 1, 1)  # (1, C, 1, 1)
+        scale = self.group_scale.to(x.device)
+        scale = scale.repeat_interleave(gc).view(1, C, 1, 1)
+        y = y * scale
+        # 4) residual add to preserve original information
+        out = x + y
+        return out

model/modules/MiT_Encoder.py ADDED Viewed

	@@ -0,0 +1,517 @@

+"""
+Based on NVIDIA's SegFormer code, cleaned and made independent
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from functools import partial
+from typing import Dict, Sequence, List, Optional, Union, Callable, Any
+import warnings
+# ============================================================================
+# Utility Functions
+# ============================================================================
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    """Truncated normal initialization (from timm)"""
+    def norm_cdf(x):
+        return (1. + math.erf(x / math.sqrt(2.))) / 2.
+    with torch.no_grad():
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+        tensor.erfinv_()
+        tensor.mul_(std * math.sqrt(2.))
+        tensor.add_(mean)
+        tensor.clamp_(min=a, max=b)
+        return tensor
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    """Truncated normal initialization"""
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+def to_2tuple(x):
+    """Convert input to 2-tuple"""
+    if isinstance(x, (list, tuple)):
+        return tuple(x)
+    return (x, x)
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample"""
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        if self.drop_prob == 0. or not self.training:
+            return x
+        keep_prob = 1 - self.drop_prob
+        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+        random_tensor.floor_()
+        output = x.div(keep_prob) * random_tensor
+        return output
+# ============================================================================
+# Core Modules
+# ============================================================================
+class LayerNorm(nn.LayerNorm):
+    """LayerNorm that supports both 3D (B, N, C) and 4D (B, C, H, W) inputs"""
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if x.ndim == 4:
+            batch_size, channels, height, width = x.shape
+            x = x.view(batch_size, channels, -1).transpose(1, 2)
+            x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+            x = x.transpose(1, 2).view(batch_size, channels, height, width)
+        else:
+            x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        return x
+class DWConv(nn.Module):
+    """Depthwise Convolution"""
+    def __init__(self, dim=768):
+        super(DWConv, self).__init__()
+        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
+    def forward(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        batch_size, _, channels = x.shape
+        x = x.transpose(1, 2).view(batch_size, channels, height, width)
+        x = self.dwconv(x)
+        x = x.flatten(2).transpose(1, 2)
+        return x
class Mlp(nn.Module):
    """Feed-forward block: Linear -> depthwise conv -> activation -> Linear.

    Dropout (the same module) is applied after the activation and again
    after the output projection.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        # Widths that are not given (or are falsy) default to the input width.
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule; invoked for every submodule via ``apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if not isinstance(m, nn.Conv2d):
            return
        # Normal init scaled by the per-group fan-out of the convolution.
        kernel_h, kernel_w = m.kernel_size
        fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
        m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
        if m.bias is not None:
            m.bias.data.zero_()

    def forward(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """Transform tokens ``x``; ``height``/``width`` describe their 2-D grid."""
        hidden = self.dwconv(self.fc1(x), height, width)
        hidden = self.drop(self.act(hidden))
        return self.drop(self.fc2(hidden))
class Attention(nn.Module):
    """Multi-head self-attention with optional spatial reduction of keys/values.

    When ``sr_ratio > 1`` the tokens used for keys and values are folded into
    a 2-D map and downsampled by a strided convolution before projection;
    queries are always computed from the full token sequence.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        sr_ratio=1,
    ):
        super().__init__()
        assert dim % num_heads == 0, (
            f"dim {dim} should be divided by num_heads {num_heads}."
        )
        self.dim = dim
        self.num_heads = num_heads
        # A falsy qk_scale selects the default 1/sqrt(head_dim) scaling.
        self.scale = qk_scale or (dim // num_heads) ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        reduce_kv = sr_ratio > 1
        self.sr = (
            nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            if reduce_kv
            else nn.Identity()
        )
        self.norm = LayerNorm(dim) if reduce_kv else nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule; invoked for every submodule via ``apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """Attend over tokens ``x`` of shape (batch, height * width, channels)."""
        batch_size, num_tokens, channels = x.shape
        head_dim = channels // self.num_heads

        # Queries: (batch, heads, tokens, head_dim).
        q = (
            self.q(x)
            .reshape(batch_size, num_tokens, self.num_heads, head_dim)
            .permute(0, 2, 1, 3)
        )

        # Keys/values come from the input tokens, or from a spatially
        # downsampled and normalised copy when reduction is enabled.
        kv_source = x
        if self.sr_ratio > 1:
            feature_map = x.permute(0, 2, 1).reshape(batch_size, channels, height, width)
            reduced = self.sr(feature_map).reshape(batch_size, channels, -1).permute(0, 2, 1)
            kv_source = self.norm(reduced)

        # (2, batch, heads, kv_tokens, head_dim): index 0 is keys, 1 is values.
        kv = (
            self.kv(kv_source)
            .reshape(batch_size, -1, 2, self.num_heads, head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        k, v = kv[0], kv[1]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = self.attn_drop(attn.softmax(dim=-1))

        merged = (attn @ v).transpose(1, 2).reshape(batch_size, num_tokens, channels)
        return self.proj_drop(self.proj(merged))
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    The block takes and returns a 4-D feature map; tokens are only used
    internally.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=LayerNorm,
        sr_ratio=1,
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio,
        )
        # Stochastic depth is only instantiated when its rate is positive.
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule; invoked for every submodule via ``apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Process a (batch, channels, height, width) map; the shape is preserved."""
        n_batch, _, height, width = x.shape
        tokens = x.flatten(2).transpose(1, 2)

        attended = self.attn(self.norm1(tokens), height, width)
        tokens = tokens + self.drop_path(attended)

        fed_forward = self.mlp(self.norm2(tokens), height, width)
        tokens = tokens + self.drop_path(fed_forward)

        # Fold the tokens back into a feature map; -1 recovers the channel count.
        return tokens.transpose(1, 2).view(n_batch, -1, height, width)
class OverlapPatchEmbed(nn.Module):
    """Image to patch embedding with overlapping patches.

    A strided convolution whose kernel (``patch_size``) is larger than its
    stride produces overlapping patches; the result is normalised with the
    file's ``LayerNorm``.

    Attributes:
        img_size: nominal input size as a 2-tuple.
        patch_size: convolution kernel size as a 2-tuple.
        H, W: token-grid size the projection produces for ``img_size``.
        num_patches: ``H * W``.
    """

    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        stride_hw = to_2tuple(stride)
        padding = (patch_size[0] // 2, patch_size[1] // 2)

        self.img_size = img_size
        self.patch_size = patch_size
        # The grid size follows the convolution output-size formula, which is
        # governed by the stride; dividing by the kernel size (as before)
        # under-reports it whenever patches overlap (e.g. 224 -> 32, not 56).
        self.H = (img_size[0] + 2 * padding[0] - patch_size[0]) // stride_hw[0] + 1
        self.W = (img_size[1] + 2 * padding[1] - patch_size[1]) // stride_hw[1] + 1
        self.num_patches = self.H * self.W

        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=stride,
            padding=padding,
        )
        self.norm = LayerNorm(embed_dim)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule; invoked for every submodule via ``apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            # Normal init scaled by the per-group fan-out of the convolution.
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` with the strided convolution, then normalise it."""
        x = self.proj(x)
        x = self.norm(x)
        return x
+# ============================================================================
+# Mix Vision Transformer (Encoder)
+# ============================================================================
class MixVisionTransformer(nn.Module):
    """Four-stage hierarchical transformer encoder.

    Each stage is a patch embedding, a stack of ``Block`` modules and a
    normalisation layer. The embeddings use strides 4, 2, 2, 2, so the
    stages emit maps at 1/4, 1/8, 1/16 and 1/32 of the input resolution.
    """

    def __init__(
        self,
        img_size=224,
        in_chans=3,
        embed_dims=[64, 128, 256, 512],
        num_heads=[1, 2, 4, 8],
        mlp_ratios=[4, 4, 4, 4],
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=LayerNorm,
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
    ):
        super().__init__()
        self.depths = depths

        # Patch embeddings for every stage are created first so that module
        # registration order stays patch_embed1..4, then block/norm pairs.
        stage_sizes = (img_size, img_size // 4, img_size // 8, img_size // 16)
        stage_inputs = (in_chans, embed_dims[0], embed_dims[1], embed_dims[2])
        kernels_and_strides = ((7, 4), (3, 2), (3, 2), (3, 2))
        for stage, (kernel, step) in enumerate(kernels_and_strides):
            embed = OverlapPatchEmbed(
                img_size=stage_sizes[stage],
                patch_size=kernel,
                stride=step,
                in_chans=stage_inputs[stage],
                embed_dim=embed_dims[stage],
            )
            setattr(self, f"patch_embed{stage + 1}", embed)

        # Stochastic depth: the drop rate grows linearly over all blocks.
        dpr = [rate.item() for rate in torch.linspace(0, drop_path_rate, sum(depths))]

        # Transformer blocks and the closing norm of each stage.
        first_block = 0
        for stage in range(4):
            blocks = [
                Block(
                    dim=embed_dims[stage],
                    num_heads=num_heads[stage],
                    mlp_ratio=mlp_ratios[stage],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[first_block + i],
                    norm_layer=norm_layer,
                    sr_ratio=sr_ratios[stage],
                )
                for i in range(depths[stage])
            ]
            setattr(self, f"block{stage + 1}", nn.Sequential(*blocks))
            setattr(self, f"norm{stage + 1}", norm_layer(embed_dims[stage]))
            first_block += depths[stage]

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule; invoked for every submodule via ``apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """Return the four stage outputs, highest resolution first."""
        stages = (
            (self.patch_embed1, self.block1, self.norm1),  # H/4,  W/4
            (self.patch_embed2, self.block2, self.norm2),  # H/8,  W/8
            (self.patch_embed3, self.block3, self.norm3),  # H/16, W/16
            (self.patch_embed4, self.block4, self.norm4),  # H/32, W/32
        )
        features = []
        for embed, blocks, norm in stages:
            # Each stage consumes the previous stage's normalised output.
            x = norm(blocks(embed(x))).contiguous()
            features.append(x)
        return features

model/modules/Seg_Head.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import torch.nn as nn
+# ============================================================================
+# Activation Module
+# ============================================================================
class Activation(nn.Module):
    """Wrapper that resolves an activation given by name or as a callable.

    Args:
        activation: ``None`` or one of the names ``'identity'``,
            ``'sigmoid'``, ``'softmax'``, ``'softmax2d'``, ``'logsoftmax'``,
            ``'tanh'``, ``'relu'``; or any callable, which is stored and
            invoked as-is in ``forward``.

    Raises:
        ValueError: if ``activation`` is neither a known name, a callable,
            nor ``None``.
    """

    def __init__(self, activation=None):
        super().__init__()
        # Factories rather than instances so only the requested module is built.
        factories = {
            'identity': nn.Identity,
            'sigmoid': nn.Sigmoid,
            'softmax': lambda: nn.Softmax(dim=1),
            # Same module as 'softmax': normalises over dim 1.
            'softmax2d': lambda: nn.Softmax(dim=1),
            'logsoftmax': lambda: nn.LogSoftmax(dim=1),
            'tanh': nn.Tanh,
            'relu': lambda: nn.ReLU(inplace=True),
        }
        if activation is None:
            self.activation = nn.Identity()
        elif isinstance(activation, str) and activation in factories:
            self.activation = factories[activation]()
        elif callable(activation):
            self.activation = activation
        else:
            # The message is derived from the table so it cannot drift from
            # the names that are actually accepted.
            supported = '/'.join(factories)
            raise ValueError(
                f'Activation should be callable/{supported}/None; got {activation}'
            )

    def forward(self, x):
        """Apply the resolved activation to ``x``."""
        return self.activation(x)
+# ============================================================================
+# Segmentation Head (nn.Sequential style)
+# ============================================================================
class SegmentationHead(nn.Sequential):
    """Sequential head: convolution, optional bilinear upsampling, activation.

    The three layers always occupy positions 0, 1 and 2; an ``nn.Identity``
    stands in for the upsampling when ``upsampling`` is not greater than 1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        activation=None,
        upsampling=1
    ):
        # padding = kernel_size // 2 keeps the spatial size for odd kernels.
        projection = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=kernel_size // 2
        )
        if upsampling > 1:
            resize = nn.UpsamplingBilinear2d(scale_factor=upsampling)
        else:
            resize = nn.Identity()
        layers = (projection, resize, Activation(activation))
        super().__init__(*layers)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+torchvision
+numpy
+pillow
+onnxruntime

samples/kvasir/images/sample_01.jpg ADDED Viewed

samples/kvasir/images/sample_02.jpg ADDED Viewed

Git LFS Details

SHA256: e7046f72d982bf65c853e1465f8f45d7f29bca4f2d0ceac286641dc27e4ac872
Pointer size: 131 Bytes
Size of remote file: 220 kB

samples/kvasir/outputs/depth/sample_01.png ADDED Viewed

samples/kvasir/outputs/depth/sample_02.png ADDED Viewed

Git LFS Details

SHA256: d441ffac673c06779f38b811751849225f5381b97c4e0a3ecbe49c390adfbdd4
Pointer size: 131 Bytes
Size of remote file: 142 kB

samples/kvasir/outputs/masks/sample_01.png ADDED Viewed

samples/kvasir/outputs/masks/sample_02.png ADDED Viewed

samples/kvasir/outputs/overlay/sample_01.jpg ADDED Viewed

samples/kvasir/outputs/overlay/sample_02.jpg ADDED Viewed

Git LFS Details

SHA256: 28f808f52574e0fc443abcde81a1ec88fd6822b2997f111a438c629c3802a7cb
Pointer size: 131 Bytes
Size of remote file: 209 kB

scripts/export_onnx.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import argparse
+import sys
+from pathlib import Path
+import torch
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT))
+from model.depthpolyp import build_depthpolyp
def load_checkpoint(path):
    """Load a checkpoint onto the CPU, preferring restricted deserialisation.

    ``weights_only=True`` is tried first; if ``torch.load`` raises
    ``TypeError`` the load is retried without that argument.
    """
    load_kwargs = {"map_location": "cpu"}
    try:
        checkpoint = torch.load(path, weights_only=True, **load_kwargs)
    except TypeError:
        # NOTE(review): this fallback performs an unrestricted pickle load,
        # so it should only ever be pointed at trusted checkpoint files.
        checkpoint = torch.load(path, **load_kwargs)
    return checkpoint
def parse_args():
    """Parse the command-line options of the ONNX export script."""
    option_specs = (
        ("--checkpoint", {"default": "checkpoints/DepthPolyp_Kvasir.pth"}),
        ("--output", {"default": "checkpoints/DepthPolyp_Kvasir.onnx"}),
        ("--image-size", {"type": int, "default": 224}),
        ("--opset", {"type": int, "default": 17}),
    )
    parser = argparse.ArgumentParser(description="Export DepthPolyp to ONNX.")
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def main():
    """Build DepthPolyp, load its checkpoint and export it to ONNX."""
    args = parse_args()

    # The checkpoint is loaded strictly, so these architecture settings
    # have to match the ones the weights were saved with.
    model = build_depthpolyp(
        encoder_name="b0",
        in_channels=3,
        num_classes=2,
        decoder_channels=256,
        activation=None,
    )
    model.load_state_dict(load_checkpoint(args.checkpoint), strict=True)
    model.eval()

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Only the batch dimension is exported as dynamic; the spatial size is
    # fixed to --image-size by the example input.
    tensor_names = ("image", "segmentation", "depth")
    example_input = torch.randn(1, 3, args.image_size, args.image_size)
    torch.onnx.export(
        model,
        example_input,
        output_path,
        input_names=["image"],
        output_names=["segmentation", "depth"],
        opset_version=args.opset,
        do_constant_folding=True,
        dynamic_axes={name: {0: "batch"} for name in tensor_names},
    )
    print(f"Exported ONNX model to {output_path}")


if __name__ == "__main__":
    main()

scripts/infer_onnx.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import argparse
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
def parse_args():
    """Parse the command-line options of the ONNX inference script."""
    option_specs = (
        ("--onnx", {"default": "checkpoints/DepthPolyp_Kvasir.onnx"}),
        ("--input", {"default": "samples/kvasir/images"}),
        ("--output", {"default": "samples/kvasir/outputs"}),
        ("--image-size", {"type": int, "default": 224}),
        ("--threshold", {"type": float, "default": 0.3}),
    )
    parser = argparse.ArgumentParser(description="Run DepthPolyp ONNX inference on images.")
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def list_images(input_path: Path):
    """Return the image files designated by ``input_path``.

    A path to a file is returned as a one-element list without checking its
    extension. Otherwise the path is searched recursively and every regular
    file whose lower-cased suffix is in ``IMAGE_EXTENSIONS`` is returned, in
    sorted order.
    """
    if input_path.is_file():
        return [input_path]
    # rglob("*") also yields directories; the is_file() check keeps a
    # directory that happens to be named like an image out of the result.
    return sorted(
        path
        for path in input_path.rglob("*")
        if path.suffix.lower() in IMAGE_EXTENSIONS and path.is_file()
    )
def preprocess(image_path: Path, image_size: int):
    """Load an image and build the network input from it.

    Returns:
        A tuple ``(image, original_size, tensor)``: the RGB image at its
        original resolution, its ``(width, height)`` as reported by PIL, and
        a float32 array of shape (1, 3, image_size, image_size) scaled to
        [0, 1].
    """
    # The context manager closes the underlying file deterministically;
    # convert() returns an image that no longer depends on that file.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    original_size = image.size
    resized = image.resize((image_size, image_size), Image.BILINEAR)
    array = np.asarray(resized).astype(np.float32) / 255.0
    # HWC -> CHW, then add a leading batch axis.
    tensor = np.transpose(array, (2, 0, 1))[None, ...]
    return image, original_size, tensor
def to_grayscale(probability: np.ndarray, size):
    """Render a map as an 8-bit grayscale image resized to ``size``.

    Values are clipped to [0, 1] and scaled to 0-255 before conversion.
    """
    levels = (np.clip(probability, 0.0, 1.0) * 255).astype(np.uint8)
    return Image.fromarray(levels, mode="L").resize(size, Image.BILINEAR)
def colorize_purple_yellow(probability: np.ndarray, size):
    """Render a map as an RGB image using a six-stop colour ramp.

    Values are clipped to [0, 1] and linearly interpolated between
    neighbouring colour stops; the result is resized to ``size``.
    """
    palette = np.array(
        [
            [38, 5, 84],
            [86, 33, 132],
            [141, 48, 140],
            [203, 71, 119],
            [245, 135, 48],
            [252, 231, 37],
        ],
        dtype=np.float32,
    )
    last_stop = len(palette) - 1
    position = np.clip(probability, 0.0, 1.0) * last_stop
    below = np.floor(position).astype(np.int32)
    # At the top of the range there is no next stop, so reuse the last one.
    above = np.clip(below + 1, 0, last_stop)
    weight = (position - below)[..., None]
    blended = palette[below] * (1.0 - weight) + palette[above] * weight
    rendered = Image.fromarray(blended.astype(np.uint8), mode="RGB")
    return rendered.resize(size, Image.BILINEAR)
def make_overlay(image: Image.Image, mask: Image.Image):
    """Blend a yellow tint over ``image``, with opacity driven by ``mask``.

    The mask's 0-255 values are mapped to an alpha of 0-155, so even a fully
    set mask pixel leaves the underlying image partly visible.
    """
    mask_array = np.asarray(mask).astype(np.float32) / 255.0
    rows, cols = mask_array.shape[0], mask_array.shape[1]

    tint = np.zeros((rows, cols, 4), dtype=np.uint8)
    tint[..., :3] = (252, 231, 37)
    tint[..., 3] = (mask_array * 155).astype(np.uint8)

    tint_layer = Image.fromarray(tint, mode="RGBA")
    blended = Image.alpha_composite(image.convert("RGBA"), tint_layer)
    return blended.convert("RGB")
def main():
    """Run the ONNX model on every input image and save the three outputs.

    For each image a binary mask, a colourised depth map and an overlay are
    written to the ``masks``, ``depth`` and ``overlay`` sub-directories of
    the output root, named after the image's stem.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_root = Path(args.output)
    mask_dir, depth_dir, overlay_dir = (
        output_root / name for name in ("masks", "depth", "overlay")
    )
    for directory in (mask_dir, depth_dir, overlay_dir):
        directory.mkdir(parents=True, exist_ok=True)

    session = ort.InferenceSession(args.onnx, providers=["CPUExecutionProvider"])
    input_name = session.get_inputs()[0].name

    images = list_images(input_path)
    if not images:
        raise FileNotFoundError(f"No images found under {input_path}")

    # The mask is thresholded on the 8-bit grayscale rendering.
    cutoff = int(args.threshold * 255)
    for image_path in images:
        image, original_size, tensor = preprocess(image_path, args.image_size)
        segmentation, depth = session.run(None, {input_name: tensor})

        # NOTE(review): channel 0 of each output is clipped to [0, 1] and used
        # as a probability map downstream - confirm the exported model already
        # emits probabilities rather than raw logits.
        seg_image = to_grayscale(segmentation[0, 0], original_size)
        depth_image = colorize_purple_yellow(depth[0, 0], original_size)
        binary_mask = seg_image.point(lambda value: 255 if value >= cutoff else 0)
        overlay = make_overlay(image, seg_image)

        stem = image_path.stem
        binary_mask.save(mask_dir / f"{stem}.png")
        depth_image.save(depth_dir / f"{stem}.png")
        overlay.save(overlay_dir / f"{stem}.jpg", quality=95)

    print(f"Processed {len(images)} image(s). Outputs saved to {output_root}")


if __name__ == "__main__":
    main()