Upload folder using huggingface_hub
Browse files- __init__.py +28 -0
- config.py +23 -0
- datasets/__init__.py +7 -0
- datasets/flair.py +140 -0
- models/__init__.py +19 -0
- models/heads.py +246 -0
- models/isdnet.py +101 -0
- models/modules.py +258 -0
- utils/__init__.py +17 -0
- utils/distributed.py +56 -0
- weights/isdnet_flair_best.pth +3 -0
__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet: Integrating Shallow and Deep Networks for Efficient Ultra-high Resolution Segmentation
|
| 3 |
+
|
| 4 |
+
A standalone PyTorch implementation.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .models import ISDNet
|
| 8 |
+
from .datasets import FLAIRDataset
|
| 9 |
+
from .config import (
|
| 10 |
+
DATA_ROOT,
|
| 11 |
+
STDC_PRETRAIN_PATH,
|
| 12 |
+
BATCH_SIZE_PER_GPU,
|
| 13 |
+
NUM_WORKERS,
|
| 14 |
+
BASE_LR,
|
| 15 |
+
WEIGHT_DECAY,
|
| 16 |
+
NUM_EPOCHS,
|
| 17 |
+
NUM_CLASSES,
|
| 18 |
+
CROP_SIZE,
|
| 19 |
+
DOWN_RATIO,
|
| 20 |
+
IGNORE_INDEX,
|
| 21 |
+
SAVE_INTERVAL,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
__version__ = "1.0.0"
|
| 25 |
+
__all__ = [
|
| 26 |
+
"ISDNet",
|
| 27 |
+
"FLAIRDataset",
|
| 28 |
+
]
|
config.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet Configuration
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
# Data paths
|
| 6 |
+
DATA_ROOT = "/ccast/FLAIR1024_optimal"
|
| 7 |
+
STDC_PRETRAIN_PATH = "STDCNet813M_73.91.tar"
|
| 8 |
+
|
| 9 |
+
# Training hyperparameters
|
| 10 |
+
BATCH_SIZE_PER_GPU = 16
|
| 11 |
+
NUM_WORKERS = 4
|
| 12 |
+
BASE_LR = 1e-3
|
| 13 |
+
WEIGHT_DECAY = 0.0005
|
| 14 |
+
NUM_EPOCHS = 80
|
| 15 |
+
|
| 16 |
+
# Model configuration
|
| 17 |
+
NUM_CLASSES = 15 # Classes 0-14 only
|
| 18 |
+
CROP_SIZE = 512
|
| 19 |
+
DOWN_RATIO = 4
|
| 20 |
+
IGNORE_INDEX = 255 # For classes >= 15
|
| 21 |
+
|
| 22 |
+
# Checkpointing
|
| 23 |
+
SAVE_INTERVAL = 5
|
datasets/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet datasets
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .flair import FLAIRDataset
|
| 6 |
+
|
| 7 |
+
__all__ = ["FLAIRDataset"]
|
datasets/flair.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FLAIR French Land Cover Dataset
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import numpy as np
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import torch
|
| 9 |
+
from torch.utils.data import Dataset
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class FLAIRDataset(Dataset):
    """FLAIR French Land Cover dataset.

    15 classes (0-14); any label >= 15 is remapped to ``ignore_index``.

    Args:
        data_root: Path to the dataset root (expects ``<root>/<split>/img`` and
            ``<root>/<split>/msk`` directories).
        split: 'train', 'valid', or 'test'.
        crop_size: Side length of the random/center crop (falsy disables cropping).
        augment: Whether to apply augmentations (auto-disabled for non-train splits).
        ignore_index: Label value to use for ignored classes.
    """

    # ImageNet normalization statistics on the 0-255 pixel scale
    MEAN = np.array([123.675, 116.28, 103.53], dtype=np.float32)
    STD = np.array([58.395, 57.12, 57.375], dtype=np.float32)

    # Class names (index == label id)
    CLASSES = [
        'building', 'pervious', 'impervious', 'bare_soil', 'water',
        'coniferous', 'deciduous', 'brushwood', 'vineyard', 'herbaceous',
        'agricultural', 'plowed_land', 'swimming_pool', 'snow', 'greenhouse'
    ]

    def __init__(self, data_root, split='train', crop_size=512, augment=True, ignore_index=255):
        self.data_root = data_root
        self.split = split
        self.crop_size = crop_size
        # Augmentation only makes sense for training data
        self.augment = augment and split == 'train'
        self.ignore_index = ignore_index

        self.img_dir = os.path.join(data_root, split, 'img')
        self.msk_dir = os.path.join(data_root, split, 'msk')
        self.img_files = sorted(os.listdir(self.img_dir))

    def __len__(self):
        return len(self.img_files)

    def _photometric_distortion(self, img):
        """Apply photometric distortion (brightness, contrast, saturation, hue).

        Operates on float32 RGB pixels in the 0-255 range; returns the same.
        """
        # Random brightness
        if np.random.rand() > 0.5:
            delta = np.random.uniform(-32, 32)
            img = img + delta

        # Random contrast
        if np.random.rand() > 0.5:
            alpha = np.random.uniform(0.5, 1.5)
            img = img * alpha

        # Convert to HSV for saturation and hue jitter
        img_uint8 = np.clip(img, 0, 255).astype(np.uint8)
        img_hsv = np.array(Image.fromarray(img_uint8).convert('HSV')).astype(np.float32)

        # Random saturation
        if np.random.rand() > 0.5:
            img_hsv[:, :, 1] = img_hsv[:, :, 1] * np.random.uniform(0.5, 1.5)

        # Random hue (PIL hue channel spans 0-255, so wrap modulo 256)
        if np.random.rand() > 0.5:
            img_hsv[:, :, 0] = (img_hsv[:, :, 0] + np.random.uniform(-18, 18)) % 256

        # Convert back to RGB
        img_hsv = np.clip(img_hsv, 0, 255).astype(np.uint8)
        img = np.array(Image.fromarray(img_hsv, mode='HSV').convert('RGB')).astype(np.float32)

        return np.clip(img, 0, 255)

    def _random_rotate(self, img, msk):
        """Random rotation by 0, 90, 180, or 270 degrees (applied jointly)."""
        k = np.random.choice([0, 1, 2, 3])
        if k > 0:
            # .copy() gives contiguous arrays (rot90 returns a view)
            img = np.rot90(img, k).copy()
            msk = np.rot90(msk, k).copy()
        return img, msk

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_files[idx])
        msk_path = os.path.join(self.msk_dir, self.img_files[idx].replace('_RGBI_', '_LABEL-COSIA_'))

        # Keep only the first 3 channels (drop NIR from RGBI imagery)
        img = np.array(Image.open(img_path)).astype(np.float32)[:, :, :3]
        msk = np.array(Image.open(msk_path)).astype(np.int64)

        # Remap classes: keep 0-14, map >=15 to ignore_index
        msk[msk >= 15] = self.ignore_index

        # Apply photometric distortion BEFORE normalization (works on 0-255 pixels)
        if self.augment:
            img = self._photometric_distortion(img)

        # Normalize with ImageNet statistics
        img = (img - self.MEAN) / self.STD

        # Random/center crop.
        # BUGFIX: the original gated cropping on height only; both dimensions
        # must be at least crop_size for the crop indices below to be valid.
        if self.crop_size and img.shape[0] >= self.crop_size and img.shape[1] >= self.crop_size:
            h, w = img.shape[:2]
            if self.augment:
                # Try to find a crop with good class coverage (cat_max_ratio logic):
                # reject crops dominated (>75%) by a single class, up to 10 attempts.
                for _ in range(10):
                    top = np.random.randint(0, h - self.crop_size + 1)
                    left = np.random.randint(0, w - self.crop_size + 1)
                    crop_msk = msk[top:top+self.crop_size, left:left+self.crop_size]
                    valid_msk = crop_msk[crop_msk != self.ignore_index]
                    if len(valid_msk) > 0:
                        unique, counts = np.unique(valid_msk, return_counts=True)
                        if len(unique) > 1:
                            max_ratio = counts.max() / counts.sum()
                            if max_ratio < 0.75:
                                break
                img = img[top:top+self.crop_size, left:left+self.crop_size]
                msk = msk[top:top+self.crop_size, left:left+self.crop_size]
            else:
                # Deterministic center crop for validation/test
                top = (h - self.crop_size) // 2
                left = (w - self.crop_size) // 2
                img = img[top:top+self.crop_size, left:left+self.crop_size]
                msk = msk[top:top+self.crop_size, left:left+self.crop_size]

        # Random rotation
        if self.augment and np.random.rand() > 0.5:
            img, msk = self._random_rotate(img, msk)

        # Random horizontal flip
        if self.augment and np.random.rand() > 0.5:
            img = np.fliplr(img).copy()
            msk = np.fliplr(msk).copy()

        # HWC float32 -> CHW tensor; mask stays int64 HW
        return torch.from_numpy(img.transpose(2, 0, 1).astype(np.float32)), torch.from_numpy(msk)
|
models/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet models
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .isdnet import ISDNet
|
| 6 |
+
from .modules import ConvX, AddBottleneck, CatBottleneck, ShallowNet, Lap_Pyramid_Conv
|
| 7 |
+
from .heads import ASPPModule, ISDHead, RefineASPPHead
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"ISDNet",
|
| 11 |
+
"ConvX",
|
| 12 |
+
"AddBottleneck",
|
| 13 |
+
"CatBottleneck",
|
| 14 |
+
"ShallowNet",
|
| 15 |
+
"Lap_Pyramid_Conv",
|
| 16 |
+
"ASPPModule",
|
| 17 |
+
"ISDHead",
|
| 18 |
+
"RefineASPPHead",
|
| 19 |
+
]
|
models/heads.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet decoder heads: ASPP, ISDHead, RefineASPPHead
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from mmcv.cnn import ConvModule
|
| 9 |
+
|
| 10 |
+
from .modules import ShallowNet, Lap_Pyramid_Conv
|
| 11 |
+
from ..utils import batch_mm_loop
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ASPPModule(nn.ModuleList):
    """Atrous Spatial Pyramid Pooling: one conv branch per dilation rate."""

    def __init__(self, dilations, in_ch, ch, conv_cfg, norm_cfg, act_cfg):
        branches = []
        for rate in dilations:
            # rate 1 uses a plain 1x1 conv; larger rates use dilated 3x3 convs
            branches.append(ConvModule(
                in_ch, ch,
                1 if rate == 1 else 3,
                dilation=rate,
                padding=0 if rate == 1 else rate,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
            ))
        super().__init__(branches)

    def forward(self, x):
        # Each branch processes the same input independently
        outputs = []
        for branch in self:
            outputs.append(branch(x))
        return outputs
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class SegmentationHead(nn.Module):
    """3x3 conv refinement followed by a 1x1 per-pixel classifier."""

    def __init__(self, conv_cfg, norm_cfg, act_cfg, in_ch, mid_ch, n_classes, **kw):
        super().__init__()
        self.conv = ConvModule(in_ch, mid_ch, 3, 1, 1,
                               conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.out = nn.Conv2d(mid_ch, n_classes, 1, bias=True)

    def forward(self, x):
        hidden = self.conv(x)
        return self.out(hidden)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class SRDecoder(nn.Module):
    """Super-resolution decoder used for the feature-alignment loss.

    Upsamples decoder features three times (by ``up_lists`` factors) and
    reconstructs a 3-channel image via ``conv_sr``.

    Args:
        conv_cfg, norm_cfg, act_cfg: mmcv ConvModule configuration dicts.
        ch: Channel width of the input/output features.
        up_lists: Upsampling factor for each of the three stages.
    """

    # Fix: the original used a mutable default argument (list); a tuple is
    # safe and backward compatible since up_lists is only indexed.
    def __init__(self, conv_cfg, norm_cfg, act_cfg, ch=128, up_lists=(2, 2, 2)):
        super().__init__()
        self.up1 = nn.Upsample(scale_factor=up_lists[0])
        self.conv1 = ConvModule(ch, ch // 2, 3, 1, 1,
                                conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.up2 = nn.Upsample(scale_factor=up_lists[1])
        self.conv2 = ConvModule(ch // 2, ch // 2, 3, 1, 1,
                                conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.up3 = nn.Upsample(scale_factor=up_lists[2])
        self.conv3 = ConvModule(ch // 2, ch, 3, 1, 1,
                                conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        # 3 output channels: reconstructs an RGB image from the features
        self.conv_sr = SegmentationHead(conv_cfg, norm_cfg, act_cfg, ch, ch // 2, 3)

    def forward(self, x, fa=False):
        """Decode features; with ``fa=True`` also return the pre-SR features."""
        h = self.conv1(self.up1(x))
        h = self.conv2(self.up2(h))
        feats = self.conv3(self.up3(h))
        recon = self.conv_sr(feats)
        if fa:
            return feats, recon
        return recon
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class ChannelAtt(nn.Module):
    """Channel attention: 3x3 conv features plus a global-pooled 1x1 descriptor."""

    def __init__(self, in_ch, out_ch, conv_cfg, norm_cfg, act_cfg):
        super().__init__()
        self.conv = ConvModule(in_ch, out_ch, 3, 1, 1,
                               conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        # no activation on the attention descriptor
        self.conv1x1 = ConvModule(out_ch, out_ch, 1, 1, 0,
                                  conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None)

    def forward(self, x):
        feat = self.conv(x)
        pooled = feat.mean(dim=(2, 3), keepdim=True)  # global average pool -> (B, C, 1, 1)
        return feat, self.conv1x1(pooled)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class RelationAwareFusion(nn.Module):
    """Relation-aware fusion of shallow (spatial) and deep (context) features.

    A channel-affinity matrix between the two branches drives MLPs that
    modulate each branch's channel attention before the weighted merge.
    """

    def __init__(self, ch, conv_cfg, norm_cfg, act_cfg, ext=2, r=16):
        super().__init__()
        self.r = r
        # learnable gates, zero-initialised so fusion starts as plain attention
        self.g1 = nn.Parameter(torch.zeros(1))
        self.g2 = nn.Parameter(torch.zeros(1))
        self.sp_mlp = nn.Sequential(
            nn.Linear(ch * 2, ch),
            nn.ReLU(),
            nn.Linear(ch, ch),
        )
        self.sp_att = ChannelAtt(ch * ext, ch, conv_cfg, norm_cfg, act_cfg)
        self.co_mlp = nn.Sequential(
            nn.Linear(ch * 2, ch),
            nn.ReLU(),
            nn.Linear(ch, ch),
        )
        self.co_att = ChannelAtt(ch, ch, conv_cfg, norm_cfg, act_cfg)
        self.co_head = ConvModule(ch, ch, 3, 1, 1,
                                  conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.smooth = ConvModule(ch, ch, 3, 1, 1,
                                 conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None)

    def forward(self, sp_feat, co_feat):
        s_f, s_a = self.sp_att(sp_feat)
        c_f, c_a = self.co_att(co_feat)
        b, c = s_a.shape[:2]

        # Channel affinity between the two attention descriptors; a
        # loop-based batch mm avoids CUBLAS strided-batched issues.
        s_mat = s_a.view(b, self.r, c // self.r)
        c_mat = c_a.view(b, self.r, c // self.r).permute(0, 2, 1)
        aff = batch_mm_loop(s_mat, c_mat).view(b, -1)

        # Gated, relation-modulated channel attentions
        re_s = torch.sigmoid(s_a + self.g1 * F.relu(self.sp_mlp(aff)).unsqueeze(-1).unsqueeze(-1))
        re_c = torch.sigmoid(c_a + self.g2 * F.relu(self.co_mlp(aff)).unsqueeze(-1).unsqueeze(-1))

        # Upsample the attended context branch to the spatial branch's size
        c_up = F.interpolate(c_f * re_c, s_f.shape[2:], mode='bilinear', align_corners=False)
        c_f = self.co_head(c_up)
        return s_f, c_f, self.smooth(s_f * re_s + c_f)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class Reducer(nn.Module):
    """1x1 conv + BN + ReLU that shrinks the channel count."""

    def __init__(self, in_ch=512, reduce=128):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, reduce, 1, bias=False)
        self.bn = nn.SyncBatchNorm(reduce)

    def forward(self, x):
        reduced = self.conv(x)
        return F.relu(self.bn(reduced))
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class ISDHead(nn.Module):
    """ISD decoder head.

    Extracts shallow STDC features from a 6-channel Laplacian-pyramid input
    and fuses them with deep backbone features via relation-aware fusion at
    two scales; during training it additionally produces auxiliary logits
    and reconstruction/feature-alignment losses.
    """

    def __init__(self, in_ch, ch, num_classes, down_ratio, prev_ch,
                 conv_cfg=None, norm_cfg=dict(type='SyncBN'), act_cfg=dict(type='ReLU'),
                 dropout=0.1, reduce=False, stdc_pretrain=''):
        super().__init__()
        self.ch = ch
        self.fuse8 = RelationAwareFusion(ch, conv_cfg, norm_cfg, act_cfg, ext=2)
        self.fuse16 = RelationAwareFusion(ch, conv_cfg, norm_cfg, act_cfg, ext=4)
        self.sr_dec = SRDecoder(conv_cfg, norm_cfg, act_cfg, ch, [4, 2, 2])
        # 6 input channels: pyramid level-0 residual + upsampled level-1
        self.stdc = ShallowNet(in_channels=6, pretrain_model=stdc_pretrain)
        self.lap = Lap_Pyramid_Conv(num_high=2)
        # NOTE(review): in forward(), seg_aux16 is applied to the 1/8-scale
        # features and seg_aux8 to the 1/16-scale ones; the apparent swap is
        # preserved as-is — confirm against training code / checkpoints.
        self.seg_aux16 = SegmentationHead(conv_cfg, norm_cfg, act_cfg, ch, ch // 2, num_classes)
        self.seg_aux8 = SegmentationHead(conv_cfg, norm_cfg, act_cfg, ch, ch // 2, num_classes)
        self.seg = SegmentationHead(conv_cfg, norm_cfg, act_cfg, ch, ch // 2, num_classes)
        self.reduce = Reducer() if reduce else None
        self.drop = nn.Dropout2d(dropout) if dropout > 0 else None

    def forward(self, inputs, prev_output, train_flag=True):
        # Laplacian pyramid decomposition of the full-resolution input
        pyr = self.lap.pyramid_decom(inputs)
        pyr1_up = F.interpolate(pyr[1], pyr[0].shape[2:], mode='bilinear', align_corners=False)
        high_in = torch.cat([pyr[0], pyr1_up], dim=1)

        # Shallow features at 1/8 and 1/16 resolution
        s8, s16 = self.stdc(high_in)

        # Deep features (optionally channel-reduced)
        deep = self.reduce(prev_output[0]) if self.reduce else prev_output[0]

        # Multi-scale fusion: 1/16 first, then refine at 1/8
        _, a16, f16 = self.fuse16(s16, deep)
        _, a8, f8 = self.fuse8(s8, f16)

        # Final segmentation logits
        out = self.seg(self.drop(f8) if self.drop else f8)

        if not train_flag:
            return out

        # Training extras: SR reconstruction and feature-alignment losses
        feats, sr_out = self.sr_dec(deep, True)
        target = pyr[0] + pyr1_up
        if sr_out.shape[2:] != target.shape[2:]:
            sr_out = F.interpolate(sr_out, target.shape[2:], mode='bilinear', align_corners=False)
        return (out,
                self.seg_aux16(a8),
                self.seg_aux8(a16),
                {'recon_losses': F.mse_loss(sr_out, target) * 0.1},
                {'fa_loss': self._fa(deep, feats)})

    def _fa(self, seg_f, sr_f, eps=1e-6):
        """Feature-alignment loss between channel self-similarity matrices."""
        if seg_f.shape[2:] != sr_f.shape[2:]:
            sr_f = F.interpolate(sr_f, seg_f.shape[2:], mode='bilinear', align_corners=False)
        sf = torch.flatten(seg_f, 2)
        srf = torch.flatten(sr_f, 2)
        # L2-normalize each channel's spatial vector
        sf = sf / (sf.norm(p=2, dim=2, keepdim=True) + eps)
        srf = srf / (srf.norm(p=2, dim=2, keepdim=True) + eps)
        # Loop-based batch mm for CUBLAS compatibility; SR side is detached
        # so it acts as a fixed target.
        sf_t = sf.permute(0, 2, 1)
        srf_t = srf.permute(0, 2, 1)
        return F.l1_loss(batch_mm_loop(sf_t, sf), batch_mm_loop(srf_t, srf).detach())
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
class RefineASPPHead(nn.Module):
    """ASPP decoder head for the low-resolution deep path.

    Combines an image-level context branch with atrous spatial pyramid
    pooling, then classifies; also returns pre-classifier features for the
    shallow refinement head.
    """

    def __init__(self, in_ch, ch, num_classes, dilations=(1, 12, 24, 36),
                 conv_cfg=None, norm_cfg=dict(type='SyncBN'), act_cfg=dict(type='ReLU'),
                 dropout=0.1, in_index=-1):
        super().__init__()
        self.in_index = in_index
        # Image-level context: global pool + 1x1 projection
        self.pool = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            ConvModule(in_ch, ch, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        )
        self.aspp = ASPPModule(dilations, in_ch, ch, conv_cfg, norm_cfg, act_cfg)
        self.bottle = ConvModule(
            (len(dilations) + 1) * ch, ch, 3, padding=1,
            conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg
        )
        self.seg = nn.Conv2d(ch, num_classes, 1)
        self.drop = nn.Dropout2d(dropout) if dropout > 0 else None

    def forward(self, inputs):
        # Accept either a feature list (pick in_index) or a single tensor
        x = inputs[self.in_index] if isinstance(inputs, (list, tuple)) else inputs
        pooled = F.interpolate(self.pool(x), x.shape[2:], mode='bilinear', align_corners=False)
        branches = [pooled] + self.aspp(x)
        feat = self.bottle(torch.cat(branches, dim=1))
        logits = self.seg(self.drop(feat) if self.drop else feat)
        return logits, [feat]
|
models/isdnet.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet: Integrating Shallow and Deep Networks for Efficient Ultra-high Resolution Segmentation
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from mmcv.cnn import ConvModule
|
| 9 |
+
import timm
|
| 10 |
+
|
| 11 |
+
from .heads import ISDHead, RefineASPPHead
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ISDNet(nn.Module):
    """ISDNet model for ultra-high resolution segmentation.

    Combines a deep ResNet backbone (run on a downsampled image) with a
    shallow STDC-like network (run at full resolution) to capture both
    global context and local detail efficiently.

    Args:
        num_classes: Number of segmentation classes.
        backbone: Backbone model name (from timm).
        ch: Base channel number for the decoder.
        down_ratio: Downsampling ratio for the deep path.
        dilations: ASPP dilation rates.
        pretrained: Use pretrained backbone weights.
        stdc_pretrain: Path to pretrained STDC weights.
    """

    def __init__(self, num_classes=15, backbone='resnet18', ch=128,
                 down_ratio=4, dilations=(1, 12, 24, 36),
                 pretrained=True, stdc_pretrain=''):
        super().__init__()
        self.ds = down_ratio

        # Deep-path backbone
        self.bb = timm.create_model(backbone, pretrained=pretrained, features_only=True)
        bb_ch = self.bb.feature_info.channels()
        print(f"Backbone channels: {bb_ch}")

        # ASPP decoder on the deepest backbone stage
        self.dec = RefineASPPHead(bb_ch[-1], ch, num_classes, dilations, in_index=-1)

        # Shallow refinement head on the full-resolution image
        self.ref = ISDHead(3, ch, num_classes, down_ratio, ch, stdc_pretrain=stdc_pretrain)

        # Auxiliary classifier on the penultimate backbone stage
        self.aux = nn.Sequential(
            ConvModule(bb_ch[-2], 64, 3, padding=1,
                       norm_cfg=dict(type='SyncBN'), act_cfg=dict(type='ReLU')),
            nn.Dropout2d(0.1),
            nn.Conv2d(64, num_classes, 1)
        )

    def forward(self, img, return_loss=True):
        """
        Forward pass.

        Args:
            img: Input image tensor (B, C, H, W).
            return_loss: If True, return a dict with all outputs needed for
                loss computation; if False, return only the final logits.

        Returns:
            If return_loss=True: Dict with 'out', 'out_deep', 'out_aux16',
                'out_aux8', 'aux_out', 'losses_re', 'losses_fa'.
            If return_loss=False: Segmentation logits (B, num_classes, H, W).
        """
        # Deep path runs on a down_ratio-times smaller image
        small = F.interpolate(
            img,
            [s // self.ds for s in img.shape[2:]],
            mode='bilinear',
            align_corners=False
        )
        x = self.bb(small)

        # Deep-path logits and pre-classifier features
        out_g, prev = self.dec(x)

        if not return_loss:
            # Inference: only the refined shallow-path output
            return F.interpolate(
                self.ref(img, prev, False),
                img.shape[2:],
                mode='bilinear',
                align_corners=False
            )

        # Full training forward with all auxiliary outputs
        out_r, a16, a8, l_re, l_fa = self.ref(img, prev, True)
        sz = img.shape[2:]

        def _up(t):
            # upsample any logits to the input resolution
            return F.interpolate(t, sz, mode='bilinear', align_corners=False)

        return {
            'out': _up(out_r),
            'out_deep': _up(out_g),
            'out_aux16': _up(a16),
            'out_aux8': _up(a8),
            'aux_out': _up(self.aux(x[-2])),
            'losses_re': l_re,
            'losses_fa': l_fa
        }
|
models/modules.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet building blocks: STDC-like modules and Laplacian pyramid
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import math
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from torch.nn import init
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ConvX(nn.Module):
    """Conv -> SyncBN -> ReLU building block."""

    def __init__(self, in_planes, out_planes, kernel=3, stride=1):
        super().__init__()
        # 'same' padding for odd kernels via kernel // 2
        self.conv = nn.Conv2d(
            in_planes, out_planes,
            kernel_size=kernel, stride=stride,
            padding=kernel // 2, bias=False
        )
        self.bn = nn.SyncBatchNorm(out_planes)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        y = self.conv(x)
        y = self.bn(y)
        return self.relu(y)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class AddBottleneck(nn.Module):
    """STDC bottleneck with residual-addition fusion.

    Sub-branches progressively halve channels; their concatenation totals
    out_planes and is added to a (possibly projected) shortcut.
    """

    def __init__(self, in_planes, out_planes, block_num=3, stride=1):
        super().__init__()
        self.conv_list = nn.ModuleList()
        self.stride = stride

        if stride == 2:
            # depthwise stride-2 conv applied after the first 1x1 conv
            self.avd_layer = nn.Sequential(
                nn.Conv2d(out_planes // 2, out_planes // 2, 3, 2, 1,
                          groups=out_planes // 2, bias=False),
                nn.SyncBatchNorm(out_planes // 2)
            )
            # projection shortcut matching spatial size and channel count
            self.skip = nn.Sequential(
                nn.Conv2d(in_planes, in_planes, 3, 2, 1, groups=in_planes, bias=False),
                nn.SyncBatchNorm(in_planes),
                nn.Conv2d(in_planes, out_planes, 1, bias=False),
                nn.SyncBatchNorm(out_planes)
            )
            stride = 1

        for idx in range(block_num):
            if idx == 0:
                self.conv_list.append(ConvX(in_planes, out_planes // 2, kernel=1))
            elif idx == 1 and block_num == 2:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 2, stride=stride))
            elif idx == 1:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 4, stride=stride))
            elif idx < block_num - 1:
                # intermediate stages halve the channel count
                self.conv_list.append(
                    ConvX(out_planes // (2 ** idx), out_planes // (2 ** (idx + 1)))
                )
            else:
                # final stage keeps its channel count
                self.conv_list.append(
                    ConvX(out_planes // (2 ** idx), out_planes // (2 ** idx))
                )

    def forward(self, x):
        outputs = []
        cur = x
        for idx, conv in enumerate(self.conv_list):
            cur = conv(cur)
            if idx == 0 and self.stride == 2:
                cur = self.avd_layer(cur)
            outputs.append(cur)

        shortcut = self.skip(x) if self.stride == 2 else x
        return torch.cat(outputs, dim=1) + shortcut
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class CatBottleneck(nn.Module):
    """STDC bottleneck with concatenation fusion.

    The first 1x1 conv's output (or its pooled version when striding) is
    concatenated with the outputs of all subsequent stages.
    """

    def __init__(self, in_planes, out_planes, block_num=3, stride=1):
        super().__init__()
        self.conv_list = nn.ModuleList()
        self.stride = stride

        if stride == 2:
            # depthwise stride-2 conv applied to the first branch's output
            self.avd_layer = nn.Sequential(
                nn.Conv2d(out_planes // 2, out_planes // 2, 3, 2, 1,
                          groups=out_planes // 2, bias=False),
                nn.SyncBatchNorm(out_planes // 2)
            )
            self.skip = nn.AvgPool2d(3, 2, 1)  # parameter-free downsampling shortcut
            stride = 1

        for idx in range(block_num):
            if idx == 0:
                self.conv_list.append(ConvX(in_planes, out_planes // 2, kernel=1))
            elif idx == 1 and block_num == 2:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 2, stride=stride))
            elif idx == 1:
                self.conv_list.append(ConvX(out_planes // 2, out_planes // 4, stride=stride))
            elif idx < block_num - 1:
                # intermediate stages halve the channel count
                self.conv_list.append(
                    ConvX(out_planes // (2 ** idx), out_planes // (2 ** (idx + 1)))
                )
            else:
                # final stage keeps its channel count
                self.conv_list.append(
                    ConvX(out_planes // (2 ** idx), out_planes // (2 ** idx))
                )

    def forward(self, x):
        out1 = self.conv_list[0](x)

        outputs = []
        cur = out1
        for idx, conv in enumerate(self.conv_list[1:]):
            if idx == 0 and self.stride == 2:
                cur = conv(self.avd_layer(out1))
            else:
                cur = conv(cur)
            outputs.append(cur)

        head = self.skip(out1) if self.stride == 2 else out1
        return torch.cat([head] + outputs, dim=1)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class ShallowNet(nn.Module):
    """
    STDC-like shallow network for high-resolution feature extraction.

    Stage outputs (relative to input resolution):
        x2  : stride-2 stem conv  -> base // 2 channels
        x4  : stride-2 conv       -> base channels
        x8  : first block stage   -> base * 4 channels
        x16 : second block stage  -> base * 8 channels

    Args:
        base: Base channel number.
        in_channels: Input channels (3 for RGB, 6 for pyramid concat).
        layers: Number of blocks per stage. Default is (2, 2); any
            sequence of ints works. (Changed from a mutable list default
            to a tuple — same values, avoids the shared-mutable-default
            pitfall; callers passing lists are unaffected.)
        block_num: Number of convs per block.
        type: 'cat' for CatBottleneck, anything else for AddBottleneck.
            (Shadows the builtin, but kept for API compatibility.)
        pretrain_model: Path to pretrained STDC weights; random init is
            used when the path is empty or missing.
    """

    def __init__(self, base=64, in_channels=3, layers=(2, 2), block_num=4,
                 type="cat", pretrain_model=''):
        super().__init__()
        block = CatBottleneck if type == "cat" else AddBottleneck
        self.in_channels = in_channels

        # Stem: two stride-2 ConvX layers (1/2 and 1/4 resolution).
        features = [
            ConvX(in_channels, base // 2, 3, 2),
            ConvX(base // 2, base, 3, 2)
        ]

        # Bottleneck stages: the first block of each stage downsamples
        # (stride 2) and doubles/quadruples channels; the rest keep shape.
        for i, layer in enumerate(layers):
            for j in range(layer):
                if i == 0 and j == 0:
                    features.append(block(base, base * 4, block_num, 2))
                elif j == 0:
                    features.append(
                        block(base * int(math.pow(2, i + 1)),
                              base * int(math.pow(2, i + 2)), block_num, 2)
                    )
                else:
                    features.append(
                        block(base * int(math.pow(2, i + 2)),
                              base * int(math.pow(2, i + 2)), block_num, 1)
                    )

        self.features = nn.Sequential(*features)
        # Stage views share modules with self.features (no parameter
        # duplication; state_dict exposes both key prefixes).
        # NOTE(review): the fixed slices below assume layers == (2, 2);
        # other stage counts would need different indices — TODO confirm.
        self.x2 = nn.Sequential(self.features[:1])
        self.x4 = nn.Sequential(self.features[1:2])
        self.x8 = nn.Sequential(self.features[2:4])
        self.x16 = nn.Sequential(self.features[4:6])

        if pretrain_model and os.path.exists(pretrain_model):
            print(f'Loading pretrain model {pretrain_model}')
            # NOTE(security): weights_only=False unpickles arbitrary objects;
            # only load checkpoints from a trusted source.
            sd = torch.load(pretrain_model, weights_only=False)["state_dict"]
            ssd = self.state_dict()
            for k, v in sd.items():
                # Stem was pretrained on RGB; duplicate its weights along the
                # input-channel axis for 6-channel input.
                # Assumes in_channels == 6 when != 3 — TODO confirm callers.
                if k == 'features.0.conv.weight' and in_channels != 3:
                    v = torch.cat([v, v], dim=1)
                if k in ssd:
                    ssd.update({k: v})
            self.load_state_dict(ssd, strict=False)
        else:
            # Random init: Kaiming for convs, unit-scale for norm layers.
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    init.kaiming_normal_(m.weight, mode='fan_out')
                elif isinstance(m, (nn.BatchNorm2d, nn.SyncBatchNorm)):
                    init.constant_(m.weight, 1)
                    init.constant_(m.bias, 0)

    def forward(self, x):
        """Return (x8, x16): 1/8- and 1/16-resolution feature maps."""
        x2 = self.x2(x)
        x4 = self.x4(x2)
        x8 = self.x8(x4)
        x16 = self.x16(x8)
        return x8, x16
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
class Lap_Pyramid_Conv(nn.Module):
    """
    Laplacian pyramid decomposition.

    Extracts high-frequency detail residuals at ``num_high`` successive
    scales using a fixed 5x5 Gaussian kernel applied depthwise.
    """

    def __init__(self, num_high=3, gauss_chl=3):
        """
        Args:
            num_high: Number of high-frequency pyramid levels to produce.
            gauss_chl: Number of image channels; the Gaussian kernel is
                replicated per channel for the grouped (depthwise) conv.
        """
        super().__init__()
        self.num_high = num_high
        self.gauss_chl = gauss_chl

        # Standard 5x5 binomial (Gaussian) kernel, normalized to sum to 1.
        k = torch.tensor([
            [1., 4., 6., 4., 1],
            [4., 16., 24., 16., 4.],
            [6., 24., 36., 24., 6.],
            [4., 16., 24., 16., 4.],
            [1., 4., 6., 4., 1.]
        ]) / 256.
        # Shape (gauss_chl, 1, 5, 5); registered as a buffer so it follows
        # .to()/.cuda()/.half()/.double() with the module.
        self.register_buffer('kernel', k.repeat(gauss_chl, 1, 1, 1))

    def conv_gauss(self, img, k):
        """Depthwise Gaussian blur with reflect padding (shape-preserving)."""
        return F.conv2d(F.pad(img, (2, 2, 2, 2), mode='reflect'), k, groups=img.shape[1])

    def downsample(self, x):
        """Drop every other row and column (2x decimation, no filtering)."""
        return x[:, :, ::2, ::2]

    def upsample(self, x):
        """2x zero-interleaved upsampling followed by Gaussian smoothing.

        Zeros are interleaved between samples along both spatial axes, then
        the result is blurred with ``4 * kernel`` (the factor compensates
        for the energy lost to the inserted zeros).
        """
        cc = torch.cat([x, torch.zeros_like(x)], dim=3)
        cc = cc.view(x.shape[0], x.shape[1], x.shape[2] * 2, x.shape[3])
        cc = cc.permute(0, 1, 3, 2)
        # Fix: match x's dtype as well as device. torch.zeros defaults to
        # float32, which made torch.cat raise for half/double inputs.
        cc = torch.cat([cc, torch.zeros(x.shape[0], x.shape[1], x.shape[3],
                                        x.shape[2] * 2, dtype=x.dtype,
                                        device=x.device)], dim=3)
        cc = cc.view(x.shape[0], x.shape[1], x.shape[3] * 2, x.shape[2] * 2)
        return self.conv_gauss(cc.permute(0, 1, 3, 2), 4 * self.kernel)

    def pyramid_decom(self, img):
        """Decompose image into Laplacian pyramid (high-frequency residuals).

        Returns a list of ``num_high`` residual tensors, finest scale first.
        The final low-frequency image is NOT returned, so the output alone
        is not sufficient for exact reconstruction.
        """
        current = img
        pyr = []
        for _ in range(self.num_high):
            down = self.downsample(self.conv_gauss(current, self.kernel))
            up = self.upsample(down)
            if up.shape[2:] != current.shape[2:]:
                # Odd input sizes: the upsampled result can overshoot by one
                # pixel; resize back to match before taking the residual.
                up = F.interpolate(up, current.shape[2:])
            pyr.append(current - up)
            current = down
        return pyr
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ISDNet utilities
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .distributed import (
|
| 6 |
+
setup_distributed,
|
| 7 |
+
cleanup_distributed,
|
| 8 |
+
print_rank0,
|
| 9 |
+
batch_mm_loop,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"setup_distributed",
|
| 14 |
+
"cleanup_distributed",
|
| 15 |
+
"print_rank0",
|
| 16 |
+
"batch_mm_loop",
|
| 17 |
+
]
|
utils/distributed.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Distributed training utilities
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import torch
|
| 7 |
+
import torch.distributed as dist
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def setup_distributed():
    """Initialize distributed training from torchrun-style env vars.

    Reads RANK / WORLD_SIZE / LOCAL_RANK when RANK is set; otherwise
    falls back to single-process defaults. The NCCL process group is
    initialized only when world_size > 1.

    Returns:
        Tuple of ints (rank, world_size, local_rank).
    """
    env = os.environ
    if 'RANK' in env:
        rank = int(env['RANK'])
        world_size = int(env['WORLD_SIZE'])
        local_rank = int(env['LOCAL_RANK'])
    else:
        # Single-process fallback (no launcher).
        rank, world_size, local_rank = 0, 1, 0

    if world_size > 1:
        dist.init_process_group('nccl')
        torch.cuda.set_device(local_rank)

    return rank, world_size, local_rank
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def cleanup_distributed():
    """Cleanup distributed training.

    Destroys the default process group if one was initialized; otherwise
    a no-op. Guarded by ``dist.is_available()`` so that PyTorch builds
    without distributed support do not fail here.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def print_rank0(msg, rank=0):
    """Emit *msg* only on rank 0, keeping multi-process logs readable."""
    if rank != 0:
        return
    print(msg)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def batch_mm_loop(a, b):
    """
    Batch matrix multiply using a loop over the batch dimension.

    Deliberately avoids CUBLAS strided batched routines (torch.bmm),
    which have issues on L40S/CUDA 12.8/PyTorch 2.10 — do not "optimize"
    this back to bmm.

    Args:
        a: Tensor of shape (batch, m, k)
        b: Tensor of shape (batch, k, n)

    Returns:
        Tensor of shape (batch, m, n); an empty (0, m, n) tensor when
        batch == 0 (torch.stack would raise on an empty list).
    """
    if a.shape[0] == 0:
        # Preserve dtype/device via new_empty instead of crashing in stack.
        return a.new_empty(0, a.shape[1], b.shape[2])
    return torch.stack([torch.mm(ai, bi) for ai, bi in zip(a, b)], dim=0)
|
weights/isdnet_flair_best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:233b4a931fe370f395d0ce60d636036eefc35e596b09b1acfa54950d7f1d89e1
|
| 3 |
+
size 142441755
|