antoine.carreaud67 committed
Commit d43c376 · 1 Parent(s): 9367521
Update with new experiments
Browse files
- model/CASWiT_fusion_last_stage_add.py  +250 -0
- model/CASWiT_m2f.py                    +354 -0
- model/CASWiT_segformer.py              +290 -0
- model/CASWiT_ssl.py                    +292 -0
- model/CASWiT_upernet.py                +250 -0
- model/build_model.py                   +38 -0
model/CASWiT_fusion_last_stage_add.py
ADDED
@@ -0,0 +1,250 @@
"""
CASWiT: Context-Aware Swin Transformer for Ultra-High Resolution Semantic Segmentation

This module implements the main CASWiT model architecture with dual-branch
high-resolution and low-resolution processing. In this variant ("last stage
add"), cross-attention fusion is disabled and the two branches are fused only
at the last encoder stage, by element-wise addition.
"""

import math
from typing import Dict

import torch
import torch.nn as nn
from transformers import UperNetForSemanticSegmentation
from transformers.utils import logging as hf_logging

hf_logging.set_verbosity_error()
hf_logging.disable_progress_bar()


class DropPath(nn.Module):
    """Drop path (stochastic depth) regularization module."""

    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = float(drop_prob)

    def forward(self, x):
        if self.drop_prob == 0.0 or (not self.training):
            return x
        keep = 1.0 - self.drop_prob
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = x.new_empty(shape).bernoulli_(keep).div_(keep)
        return x * mask


class CrossFusionBlock(nn.Module):
    """
    Cross-attention fusion block that enables HR features to attend to LR features.

    Implements pre-norm cross-attention (Q=HR, K/V=LR).

    Args:
        C_hr: Channel dimension of HR features
        C_lr: Channel dimension of LR features
        num_heads: Number of attention heads
        mlp_ratio: MLP expansion ratio
        drop: Dropout rate
        drop_path: Drop path rate
    """

    def __init__(self, C_hr: int, C_lr: int, num_heads: int = 8,
                 mlp_ratio: float = 4.0, drop: float = 0.0, drop_path: float = 0.1):
        super().__init__()

        self.norm_q = nn.LayerNorm(C_hr)
        self.norm_kv = nn.LayerNorm(C_lr)
        self.attn = nn.MultiheadAttention(
            embed_dim=C_hr, num_heads=num_heads, kdim=C_lr, vdim=C_lr,
            dropout=drop, batch_first=True
        )

        hidden = int(C_hr * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.LayerNorm(C_hr),
            nn.Linear(C_hr, hidden),
            nn.GELU(),
            nn.Linear(hidden, C_hr),
        )

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the cross-attention fusion block.

        Args:
            x_hr: HR features [B, C_hr, H_hr, W_hr]
            x_lr: LR features [B, C_lr, H_lr, W_lr]

        Returns:
            Fused HR features [B, C_hr, H_hr, W_hr]
        """
        B, C_hr, H_hr, W_hr = x_hr.shape
        _, C_lr, H_lr, W_lr = x_lr.shape

        # Flatten to sequences
        q = x_hr.flatten(2).transpose(1, 2)   # [B, N_hr, C_hr]
        kv = x_lr.flatten(2).transpose(1, 2)  # [B, N_lr, C_lr]

        # Pre-norm
        qn = self.norm_q(q)
        kvn = self.norm_kv(kv)

        attn_out, _ = self.attn(qn, kvn, kvn)  # [B, N_hr, C_hr]

        # Residual connection + MLP
        y = q + attn_out
        y = y + self.mlp(y)

        return y.transpose(1, 2).view(B, C_hr, H_hr, W_hr)

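# --- Illustrative note (not part of the original file) -----------------------
# In this "last stage add" variant, CrossFusionBlock is defined for reference
# but never instantiated: CASWiT.forward below fuses the branches by simple
# addition at stage 4 (see the commented-out `ca(...)` call there). A minimal,
# hypothetical shape check of the block on its own:
#
#   blk = CrossFusionBlock(C_hr=96, C_lr=96, num_heads=8)
#   hr = torch.randn(2, 96, 64, 64)   # HR stage-1 features
#   lr = torch.randn(2, 96, 16, 16)   # LR stage-1 features (coarser grid)
#   assert blk(hr, lr).shape == hr.shape
# ------------------------------------------------------------------------------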
class CASWiT(nn.Module):
    """
    CASWiT: Context-Aware Swin Transformer for Ultra-High Resolution Semantic Segmentation.

    Dual-branch architecture with:
    - HR branch: Processes high-resolution crops
    - LR branch: Processes low-resolution context
    - Branch fusion (this variant: element-wise addition at the last stage only;
      cross-attention fusion is disabled)

    Args:
        num_head_xa: Number of cross-attention heads (unused in this variant)
        num_classes: Number of segmentation classes
        model_name: HuggingFace model identifier for UPerNet-Swin
        mlp_ratio: MLP expansion ratio in fusion blocks (unused in this variant)
        drop_path: Drop path rate (unused in this variant)
    """

    def __init__(self, num_head_xa: int = 1, num_classes: int = 12,
                 model_name: str = "openmmlab/upernet-swin-tiny",
                 mlp_ratio: float = 4.0, drop_path: float = 0.1):
        super().__init__()
        # Load two UPerNet backbones (HR and LR branches)
        model_hr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )
        model_lr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )

        # Extract HR branch components
        self.embeddings_hr = model_hr.backbone.embeddings
        self.encoder_layers_hr = model_hr.backbone.encoder.layers
        self.hidden_states_norms_hr = model_hr.backbone.hidden_states_norms
        self.decoder = model_hr.decode_head

        # Extract LR branch components
        self.embeddings_lr = model_lr.backbone.embeddings
        self.encoder_layers_lr = model_lr.backbone.encoder.layers
        self.hidden_states_norms_lr = model_lr.backbone.hidden_states_norms
        self.decoder_lr = model_lr.decode_head

        # Stage dimensions per backbone size:
        # tiny: [96, 192, 384, 768]  base: [128, 256, 512, 1024]  large: [192, 384, 768, 1536]
        dims_map = {
            "tiny": [96, 192, 384, 768],
            "base": [128, 256, 512, 1024],
            "large": [192, 384, 768, 1536],
        }
        # Infer dimensions from the model name (kept for reference; this variant
        # builds no cross-attention blocks, so `dims` is not used further)
        if "tiny" in model_name.lower():
            dims = dims_map["tiny"]
        elif "large" in model_name.lower():
            dims = dims_map["large"]
        else:
            dims = dims_map["base"]  # default to base

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Forward pass through the CASWiT model.

        Args:
            x_hr: HR input images [B, 3, H_hr, W_hr]
            x_lr: LR input images [B, 3, H_lr, W_lr]

        Returns:
            Dictionary with 'logits_hr' and 'logits_lr' segmentation logits
        """
        B = x_hr.size(0)

        # Patch embeddings
        x_hr_seq, _ = self.embeddings_hr(x_hr)
        x_lr_seq, _ = self.embeddings_lr(x_lr)

        N_hr, C_hr = x_hr_seq.shape[1], x_hr_seq.shape[2]
        N_lr, C_lr = x_lr_seq.shape[1], x_lr_seq.shape[2]
        H_hr = W_hr = int(math.sqrt(N_hr))
        H_lr = W_lr = int(math.sqrt(N_lr))
        dims_hr = (H_hr, W_hr)
        dims_lr = (H_lr, W_lr)

        features_hr: Dict[str, torch.Tensor] = {}
        features_lr: Dict[str, torch.Tensor] = {}

        # Process through encoder stages
        for idx, (stage_hr, stage_lr) in enumerate(zip(
            self.encoder_layers_hr, self.encoder_layers_lr
        )):
            # HR branch blocks
            for block in stage_hr.blocks:
                x_hr_seq = block(x_hr_seq, dims_hr)
                if isinstance(x_hr_seq, tuple):
                    x_hr_seq = x_hr_seq[0]

            # LR branch blocks
            for block in stage_lr.blocks:
                x_lr_seq = block(x_lr_seq, dims_lr)
                if isinstance(x_lr_seq, tuple):
                    x_lr_seq = x_lr_seq[0]

            # Layer normalization
            x_hr_seq = self.hidden_states_norms_hr[f"stage{idx+1}"](x_hr_seq)
            x_lr_seq = self.hidden_states_norms_lr[f"stage{idx+1}"](x_lr_seq)

            H_hr, W_hr = dims_hr
            H_lr, W_lr = dims_lr
            C_hr = x_hr_seq.shape[-1]
            C_lr = x_lr_seq.shape[-1]

            # Reshape to spatial format
            feat_hr = x_hr_seq.transpose(1, 2).contiguous().view(B, C_hr, H_hr, W_hr)
            feat_lr = x_lr_seq.transpose(1, 2).contiguous().view(B, C_lr, H_lr, W_lr)

            # Fuse branches: element-wise addition at the last stage only
            # (requires matching HR/LR spatial dims at stage 4)
            if idx == 3:
                fused_hr = feat_hr + feat_lr
            else:
                fused_hr = feat_hr
            # Cross-attention fusion is disabled in this variant:
            # fused_hr = ca(feat_hr, feat_lr)
            fused_hr_seq = fused_hr.flatten(2).transpose(1, 2).contiguous()

            # Downsample if the stage has it
            if stage_hr.downsample is not None:
                fused_hr_seq = stage_hr.downsample(fused_hr_seq, dims_hr)
                dims_hr = (dims_hr[0] // 2, dims_hr[1] // 2)
            if stage_lr.downsample is not None:
                x_lr_seq = stage_lr.downsample(x_lr_seq, dims_lr)
                dims_lr = (dims_lr[0] // 2, dims_lr[1] // 2)

            features_hr[f"stage{idx+1}"] = fused_hr
            features_lr[f"stage{idx+1}"] = feat_lr
            x_hr_seq = fused_hr_seq

        # Decode HR features
        features_tuple = (
            features_hr["stage1"],
            features_hr["stage2"],
            features_hr["stage3"],
            features_hr["stage4"],
        )
        logits = self.decoder(features_tuple)

        # Decode LR features (for auxiliary supervision)
        features_tuple_lr = (
            features_lr["stage1"],
            features_lr["stage2"],
            features_lr["stage3"],
            features_lr["stage4"],
        )
        logits_lr = self.decoder_lr(features_tuple_lr)

        return {"logits_hr": logits, "logits_lr": logits_lr}
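A minimal smoke test for this variant (a sketch, not part of the commit; the import path and the 512×512 sizes are assumptions, and the stage-4 addition requires the HR crop and LR context to be given at the same pixel resolution):

import torch
from model.CASWiT_fusion_last_stage_add import CASWiT  # assumed repo-relative import

model = CASWiT(num_classes=12, model_name="openmmlab/upernet-swin-tiny").eval()
x_hr = torch.randn(1, 3, 512, 512)  # high-resolution crop
x_lr = torch.randn(1, 3, 512, 512)  # low-resolution context, resized to the crop size
with torch.no_grad():
    out = model(x_hr, x_lr)
print(out["logits_hr"].shape, out["logits_lr"].shape)  # expected at 1/4 of the input resolution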
model/CASWiT_m2f.py
ADDED
@@ -0,0 +1,354 @@
"""
CASWiT with Mask2Former heads (HuggingFace).

This file is identical to the original CASWiT implementation except that:
- self.decoder and self.decoder_lr are replaced by a Mask2Former semantic head
  implemented using HuggingFace's Mask2Former pixel decoder + transformer module.

The rest of the model (embeddings, Swin encoder stages, cross-attention fusion) is unchanged.
"""

import math
from typing import Dict, Tuple, List
import torch
import torch.nn as nn

from transformers import UperNetForSemanticSegmentation, Mask2FormerConfig
from transformers.models.mask2former.modeling_mask2former import (
    Mask2FormerPixelDecoder,
    Mask2FormerTransformerModule,
)
from transformers.utils import logging as hf_logging

hf_logging.set_verbosity_error()
hf_logging.disable_progress_bar()


class DropPath(nn.Module):
    """Drop path (stochastic depth) regularization module."""

    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = float(drop_prob)

    def forward(self, x):
        if self.drop_prob == 0.0 or (not self.training):
            return x
        keep = 1.0 - self.drop_prob
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = x.new_empty(shape).bernoulli_(keep).div_(keep)
        return x * mask


class CrossFusionBlock(nn.Module):
    """
    Cross-attention fusion block that enables HR features to attend to LR features.
    Implements pre-norm cross-attention (Q=HR, K/V=LR).
    """

    def __init__(self, C_hr: int, C_lr: int, num_heads: int = 8,
                 mlp_ratio: float = 4.0, drop: float = 0.0, drop_path: float = 0.1):
        super().__init__()

        self.norm_q = nn.LayerNorm(C_hr)
        self.norm_kv = nn.LayerNorm(C_lr)
        self.attn = nn.MultiheadAttention(
            embed_dim=C_hr, num_heads=num_heads, kdim=C_lr, vdim=C_lr,
            dropout=drop, batch_first=True
        )

        hidden = int(C_hr * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.LayerNorm(C_hr),
            nn.Linear(C_hr, hidden),
            nn.GELU(),
            nn.Linear(hidden, C_hr),
        )

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> torch.Tensor:
        B, C_hr, H_hr, W_hr = x_hr.shape

        q = x_hr.flatten(2).transpose(1, 2)   # [B, N_hr, C_hr]
        kv = x_lr.flatten(2).transpose(1, 2)  # [B, N_lr, C_lr]

        qn = self.norm_q(q)
        kvn = self.norm_kv(kv)

        attn_out, _ = self.attn(qn, kvn, kvn)  # [B, N_hr, C_hr]

        y = q + attn_out
        y = y + self.mlp(y)

        return y.transpose(1, 2).view(B, C_hr, H_hr, W_hr)


class Mask2FormerSemanticHead(nn.Module):
    """
    A minimal Mask2Former "semantic segmentation head" that consumes multi-scale backbone features
    and outputs per-class per-pixel scores.

    Input:
        features: tuple/list of 4 feature maps (stage1..stage4), each [B, C_i, H_i, W_i].
        The spatial strides should typically be [4, 8, 16, 32] relative to the input image.

    Output:
        semantic_scores: [B, num_classes, H_out, W_out], where H_out/W_out match the mask_features
        resolution produced by the Mask2Former pixel decoder (typically stride 4).

    Notes:
        Mask2Former natively predicts:
        - class_queries_logits: [B, Q, num_classes+1] (includes "no object")
        - masks_queries_logits: [B, Q, H_out, W_out]
        For semantic segmentation, a common aggregation is:
            semantic_probs = sum_q softmax(class_logits_q)[c] * sigmoid(mask_logits_q)[h,w]
        Here we return these aggregated per-class *scores* (in [0,1]) as "logits" for compatibility
        with the original CASWiT API. If you need true logits, apply logit() carefully (numerical stability).
    """

    def __init__(
        self,
        feature_channels: List[int],
        num_classes: int,
        num_queries: int = 100,
        feature_size: int = 256,
        mask_feature_size: int = 256,
        common_stride: int = 4,
    ):
        super().__init__()

        cfg = Mask2FormerConfig(
            num_labels=num_classes,
            num_queries=num_queries,
            feature_size=feature_size,
            mask_feature_size=mask_feature_size,
            common_stride=common_stride,
            feature_strides=[4, 8, 16, 32],
            encoder_layers=1,
            decoder_layers=1,
            num_attention_heads=8,
            dim_feedforward=1024,
            output_auxiliary_logits=False,
            # keep defaults for transformer, heads, etc.
        )

        self.config = cfg
        self.num_classes = num_classes
        self.num_queries = num_queries

        # Pixel decoder consumes backbone channels and produces:
        # - multi_scale_features (3 levels: 1/8, 1/16, 1/32)
        # - mask_features (typically 1/4)
        self.pixel_decoder = Mask2FormerPixelDecoder(cfg, feature_channels=feature_channels)

        # Transformer module consumes:
        # - multi_scale_features (list of 3 tensors)
        # - mask_features (tensor at stride 4)
        # and returns masks_queries_logits for each decoder layer + intermediate states
        self.transformer_module = Mask2FormerTransformerModule(in_features=cfg.feature_size, config=cfg)

        # Class predictor (same idea as HF Mask2FormerForUniversalSegmentation)
        self.class_predictor = nn.Linear(cfg.hidden_dim, num_classes + 1)

    def forward(self, features: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) -> torch.Tensor:
        if not isinstance(features, (tuple, list)) or len(features) != 4:
            raise ValueError("Mask2FormerSemanticHead expects a tuple/list of 4 feature maps: "
                             "(stage1, stage2, stage3, stage4).")

        # Expected order: [stage1, stage2, stage3, stage4] (increasing stride).
        # The pixel decoder internally reverses and uses the last 3 feature maps for deformable attention.
        pixel_out = self.pixel_decoder(list(features), return_dict=True)
        multi_scale = list(pixel_out.multi_scale_features)  # 3 levels
        mask_features = pixel_out.mask_features             # stride 4

        dec_out = self.transformer_module(
            multi_scale_features=multi_scale,
            mask_features=mask_features,
            output_hidden_states=True,
            output_attentions=False,
        )

        # Use the last decoder layer's predictions
        masks_queries_logits = dec_out.masks_queries_logits[-1]  # [B, Q, H, W]

        # The last hidden state can be shaped [B, Q, D] OR [Q, B, D] depending on HF internals
        # (for intermediate_hidden_states, HF uses [Q, B, D] and transposes in its heads).
        # Robustly support both:
        hidden = dec_out.last_hidden_state
        if hidden.dim() != 3:
            raise RuntimeError(f"Unexpected last_hidden_state shape: {tuple(hidden.shape)}")

        if hidden.shape[0] == self.num_queries and hidden.shape[1] == masks_queries_logits.shape[0]:
            # [Q, B, D] -> [B, Q, D]
            hidden_bqd = hidden.transpose(0, 1)
        else:
            # assume [B, Q, D]
            hidden_bqd = hidden

        class_queries_logits = self.class_predictor(hidden_bqd)  # [B, Q, C+1]

        # Aggregate to semantic per-class scores at mask resolution:
        # softmax over classes (including no-object), then drop the no-object channel
        class_probs = class_queries_logits.softmax(dim=-1)[..., :-1]  # [B, Q, C]
        mask_probs = masks_queries_logits.sigmoid()                   # [B, Q, H, W]

        # semantic_scores[b,c,h,w] = sum_q class_probs[b,q,c] * mask_probs[b,q,h,w]
        semantic_scores = torch.einsum("bqc,bqhw->bchw", class_probs, mask_probs)

        return semantic_scores

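# --- Illustrative note (not part of the original file) -----------------------
# The einsum in forward() above is equivalent to this explicit loop over queries:
#
#   B, Q, C = class_probs.shape
#   _, _, H, W = mask_probs.shape
#   semantic_scores = torch.zeros(B, C, H, W, device=class_probs.device)
#   for qi in range(Q):
#       semantic_scores += (class_probs[:, qi, :, None, None]   # [B, C, 1, 1]
#                           * mask_probs[:, qi, None, :, :])    # [B, 1, H, W]
#
# i.e. each query votes for every class in proportion to its class probability,
# weighted by its soft mask at each pixel.
# ------------------------------------------------------------------------------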
class CASWiT(nn.Module):
    """
    CASWiT: Context-Aware Swin Transformer for Ultra-High Resolution Semantic Segmentation.

    Only change vs the original: replace self.decoder and self.decoder_lr with Mask2FormerSemanticHead.
    """

    def __init__(self, num_head_xa: int = 1, num_classes: int = 12,
                 model_name: str = "openmmlab/upernet-swin-tiny",
                 mlp_ratio: float = 4.0, drop_path: float = 0.1):
        super().__init__()

        model_hr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )
        model_lr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )

        # Extract HR branch components
        self.embeddings_hr = model_hr.backbone.embeddings
        self.encoder_layers_hr = model_hr.backbone.encoder.layers
        self.hidden_states_norms_hr = model_hr.backbone.hidden_states_norms

        # Extract LR branch components
        self.embeddings_lr = model_lr.backbone.embeddings
        self.encoder_layers_lr = model_lr.backbone.encoder.layers
        self.hidden_states_norms_lr = model_lr.backbone.hidden_states_norms

        # Infer Swin stage dims from the model name (same as the original)
        dims_map = {
            "tiny": [96, 192, 384, 768],
            "base": [128, 256, 512, 1024],
            "large": [192, 384, 768, 1536],
        }
        if "tiny" in model_name.lower():
            dims = dims_map["tiny"]
        elif "large" in model_name.lower():
            dims = dims_map["large"]
        else:
            dims = dims_map["base"]

        # >>> ONLY MODIFIED PART: decoder / decoder_lr <<<
        self.decoder = Mask2FormerSemanticHead(feature_channels=dims, num_classes=num_classes)
        self.decoder_lr = Mask2FormerSemanticHead(feature_channels=dims, num_classes=num_classes)

        # Cross-attention blocks at each stage
        self.cross_attn_blocks = nn.ModuleList([
            CrossFusionBlock(dim, dim, num_heads=num_head_xa,
                             mlp_ratio=mlp_ratio, drop=0.0, drop_path=drop_path)
            for dim in dims
        ])

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> Dict[str, torch.Tensor]:
        B = x_hr.size(0)

        # Patch embeddings
        x_hr_seq, _ = self.embeddings_hr(x_hr)
        x_lr_seq, _ = self.embeddings_lr(x_lr)

        N_hr = x_hr_seq.shape[1]
        N_lr = x_lr_seq.shape[1]
        H_hr = W_hr = int(math.sqrt(N_hr))
        H_lr = W_lr = int(math.sqrt(N_lr))
        dims_hr = (H_hr, W_hr)
        dims_lr = (H_lr, W_lr)

        features_hr: Dict[str, torch.Tensor] = {}
        features_lr: Dict[str, torch.Tensor] = {}

        for idx, (stage_hr, stage_lr, ca) in enumerate(zip(
            self.encoder_layers_hr, self.encoder_layers_lr, self.cross_attn_blocks
        )):
            for block in stage_hr.blocks:
                x_hr_seq = block(x_hr_seq, dims_hr)
                if isinstance(x_hr_seq, tuple):
                    x_hr_seq = x_hr_seq[0]

            for block in stage_lr.blocks:
                x_lr_seq = block(x_lr_seq, dims_lr)
                if isinstance(x_lr_seq, tuple):
                    x_lr_seq = x_lr_seq[0]

            x_hr_seq = self.hidden_states_norms_hr[f"stage{idx+1}"](x_hr_seq)
            x_lr_seq = self.hidden_states_norms_lr[f"stage{idx+1}"](x_lr_seq)

            H_hr, W_hr = dims_hr
            H_lr, W_lr = dims_lr
            C_hr = x_hr_seq.shape[-1]
            C_lr = x_lr_seq.shape[-1]

            feat_hr = x_hr_seq.transpose(1, 2).contiguous().view(B, C_hr, H_hr, W_hr)
            feat_lr = x_lr_seq.transpose(1, 2).contiguous().view(B, C_lr, H_lr, W_lr)

            fused_hr = ca(feat_hr, feat_lr)
            fused_hr_seq = fused_hr.flatten(2).transpose(1, 2).contiguous()

            if stage_hr.downsample is not None:
                fused_hr_seq = stage_hr.downsample(fused_hr_seq, dims_hr)
                dims_hr = (dims_hr[0] // 2, dims_hr[1] // 2)
            if stage_lr.downsample is not None:
                x_lr_seq = stage_lr.downsample(x_lr_seq, dims_lr)
                dims_lr = (dims_lr[0] // 2, dims_lr[1] // 2)

            features_hr[f"stage{idx+1}"] = fused_hr
            features_lr[f"stage{idx+1}"] = feat_lr
            x_hr_seq = fused_hr_seq

        # Decode HR features
        features_tuple = (
            features_hr["stage1"],
            features_hr["stage2"],
            features_hr["stage3"],
            features_hr["stage4"],
        )
        logits = self.decoder(features_tuple)

        # Decode LR features
        features_tuple_lr = (
            features_lr["stage1"],
            features_lr["stage2"],
            features_lr["stage3"],
            features_lr["stage4"],
        )
        logits_lr = self.decoder_lr(features_tuple_lr)

        return {"logits_hr": logits, "logits_lr": logits_lr}


def _test_mask2former_head():
    """
    Minimal sanity test: validates that the Mask2FormerSemanticHead consumes
    a (stage1..stage4) feature tuple and returns [B, C, H1, W1] scores.
    """
    torch.manual_seed(0)
    B = 1
    num_classes = 12
    dims = [96, 192, 384, 768]
    H1, W1 = 8, 8

    feats = (
        torch.randn(B, dims[0], H1, W1),
        torch.randn(B, dims[1], H1 // 2, W1 // 2),
        torch.randn(B, dims[2], H1 // 4, W1 // 4),
        torch.randn(B, dims[3], H1 // 8, W1 // 8),
    )

    head = Mask2FormerSemanticHead(feature_channels=dims, num_classes=num_classes, num_queries=50)
    with torch.no_grad():
        out = head(feats)

    assert out.shape == (B, num_classes, H1, W1), f"Unexpected output shape: {out.shape}"
    assert torch.isfinite(out).all(), "NaN/Inf in output"
    return out.shape


if __name__ == "__main__":
    # Run the head test
    print("Mask2Former head test output shape:", _test_mask2former_head())
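Because Mask2FormerSemanticHead returns aggregated per-class probabilities rather than raw logits, plain F.cross_entropy (which applies its own log-softmax) is a questionable fit. One possible workaround, sketched here as an assumption rather than the commit's actual training code: take a clamped log and use the NLL loss.

import torch
import torch.nn.functional as F

scores = torch.rand(2, 12, 64, 64)          # stand-in for Mask2FormerSemanticHead output, in [0, 1]
target = torch.randint(0, 12, (2, 64, 64))  # dense ground-truth labels

log_scores = torch.log(scores.clamp_min(1e-6))  # treat the scores as unnormalized likelihoods
loss = F.nll_loss(log_scores, target)
pred = scores.argmax(dim=1)                     # per-pixel semantic prediction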
model/CASWiT_segformer.py
ADDED
@@ -0,0 +1,290 @@
"""
CASWiT: Context-Aware Swin Transformer for Ultra-High Resolution Semantic Segmentation

This module implements the main CASWiT model architecture with dual-branch
high-resolution and low-resolution processing with cross-attention fusion.
This variant replaces the UPerNet decode heads with SegFormer decode heads.
"""

import math
from typing import Dict

import torch
import torch.nn as nn
from transformers import UperNetForSemanticSegmentation, SegformerConfig
from transformers.models.segformer.modeling_segformer import SegformerDecodeHead
from transformers.utils import logging as hf_logging

hf_logging.set_verbosity_error()
hf_logging.disable_progress_bar()


class DropPath(nn.Module):
    """Drop path (stochastic depth) regularization module."""

    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = float(drop_prob)

    def forward(self, x):
        if self.drop_prob == 0.0 or (not self.training):
            return x
        keep = 1.0 - self.drop_prob
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = x.new_empty(shape).bernoulli_(keep).div_(keep)
        return x * mask


class CrossFusionBlock(nn.Module):
    """
    Cross-attention fusion block that enables HR features to attend to LR features.

    Implements pre-norm cross-attention (Q=HR, K/V=LR).

    Args:
        C_hr: Channel dimension of HR features
        C_lr: Channel dimension of LR features
        num_heads: Number of attention heads
        mlp_ratio: MLP expansion ratio
        drop: Dropout rate
        drop_path: Drop path rate
    """

    def __init__(self, C_hr: int, C_lr: int, num_heads: int = 8,
                 mlp_ratio: float = 4.0, drop: float = 0.0, drop_path: float = 0.1):
        super().__init__()

        self.norm_q = nn.LayerNorm(C_hr)
        self.norm_kv = nn.LayerNorm(C_lr)
        self.attn = nn.MultiheadAttention(
            embed_dim=C_hr, num_heads=num_heads, kdim=C_lr, vdim=C_lr,
            dropout=drop, batch_first=True
        )

        hidden = int(C_hr * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.LayerNorm(C_hr),
            nn.Linear(C_hr, hidden),
            nn.GELU(),
            nn.Linear(hidden, C_hr),
        )

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the cross-attention fusion block.

        Args:
            x_hr: HR features [B, C_hr, H_hr, W_hr]
            x_lr: LR features [B, C_lr, H_lr, W_lr]

        Returns:
            Fused HR features [B, C_hr, H_hr, W_hr]
        """
        B, C_hr, H_hr, W_hr = x_hr.shape
        _, C_lr, H_lr, W_lr = x_lr.shape

        # Flatten to sequences
        q = x_hr.flatten(2).transpose(1, 2)   # [B, N_hr, C_hr]
        kv = x_lr.flatten(2).transpose(1, 2)  # [B, N_lr, C_lr]

        # Pre-norm
        qn = self.norm_q(q)
        kvn = self.norm_kv(kv)

        attn_out, _ = self.attn(qn, kvn, kvn)  # [B, N_hr, C_hr]

        # Residual connection + MLP
        y = q + attn_out
        y = y + self.mlp(y)

        return y.transpose(1, 2).view(B, C_hr, H_hr, W_hr)


class CASWiT(nn.Module):
    """
    CASWiT: Context-Aware Swin Transformer for Ultra-High Resolution Semantic Segmentation.

    Dual-branch architecture with:
    - HR branch: Processes high-resolution crops
    - LR branch: Processes low-resolution context
    - Cross-attention fusion at each encoder stage

    Args:
        num_head_xa: Number of cross-attention heads
        num_classes: Number of segmentation classes
        model_name: HuggingFace model identifier for UPerNet-Swin
        mlp_ratio: MLP expansion ratio in fusion blocks
        drop_path: Drop path rate
    """

    def __init__(self, num_head_xa: int = 1, num_classes: int = 12,
                 model_name: str = "openmmlab/upernet-swin-tiny",
                 mlp_ratio: float = 4.0, drop_path: float = 0.1):
        super().__init__()
        # Load two UPerNet backbones (HR and LR branches)
        model_hr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )
        model_lr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )

        # Extract HR branch components
        self.embeddings_hr = model_hr.backbone.embeddings
        self.encoder_layers_hr = model_hr.backbone.encoder.layers
        self.hidden_states_norms_hr = model_hr.backbone.hidden_states_norms
        self.decoder = None  # placeholder, set after dims inference

        # Extract LR branch components
        self.embeddings_lr = model_lr.backbone.embeddings
        self.encoder_layers_lr = model_lr.backbone.encoder.layers
        self.hidden_states_norms_lr = model_lr.backbone.hidden_states_norms
        self.decoder_lr = None  # placeholder, set after dims inference

        # Cross-attention blocks at each stage
        # Dimensions: tiny: [96, 192, 384, 768]  base: [128, 256, 512, 1024]  large: [192, 384, 768, 1536]
        dims_map = {
            "tiny": [96, 192, 384, 768],
            "base": [128, 256, 512, 1024],
            "large": [192, 384, 768, 1536],
        }
        # Infer dimensions from the model name
        if "tiny" in model_name.lower():
            dims = dims_map["tiny"]
        elif "large" in model_name.lower():
            dims = dims_map["large"]
        else:
            dims = dims_map["base"]  # default to base

        # SegFormer decode heads over the Swin stage features
        segformer_cfg = SegformerConfig(
            num_labels=num_classes,
            hidden_sizes=dims,
            num_encoder_blocks=4,
            decoder_hidden_size=512,
            classifier_dropout_prob=0.0,
        )
        self.decoder = SegformerDecodeHead(segformer_cfg)
        self.decoder_lr = SegformerDecodeHead(segformer_cfg)

        self.cross_attn_blocks = nn.ModuleList([
            CrossFusionBlock(dim, dim, num_heads=num_head_xa,
                             mlp_ratio=mlp_ratio, drop=0.0, drop_path=drop_path)
            for dim in dims
        ])

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Forward pass through the CASWiT model.

        Args:
            x_hr: HR input images [B, 3, H_hr, W_hr]
            x_lr: LR input images [B, 3, H_lr, W_lr]

        Returns:
            Dictionary with 'logits_hr' and 'logits_lr' segmentation logits
        """
        B = x_hr.size(0)

        # Patch embeddings
        x_hr_seq, _ = self.embeddings_hr(x_hr)
        x_lr_seq, _ = self.embeddings_lr(x_lr)

        N_hr, C_hr = x_hr_seq.shape[1], x_hr_seq.shape[2]
        N_lr, C_lr = x_lr_seq.shape[1], x_lr_seq.shape[2]
        H_hr = W_hr = int(math.sqrt(N_hr))
        H_lr = W_lr = int(math.sqrt(N_lr))
        dims_hr = (H_hr, W_hr)
        dims_lr = (H_lr, W_lr)

        features_hr: Dict[str, torch.Tensor] = {}
        features_lr: Dict[str, torch.Tensor] = {}

        # Process through encoder stages with cross-attention fusion
        for idx, (stage_hr, stage_lr, ca) in enumerate(zip(
            self.encoder_layers_hr, self.encoder_layers_lr, self.cross_attn_blocks
        )):
            # HR branch blocks
            for block in stage_hr.blocks:
                x_hr_seq = block(x_hr_seq, dims_hr)
                if isinstance(x_hr_seq, tuple):
                    x_hr_seq = x_hr_seq[0]

            # LR branch blocks
            for block in stage_lr.blocks:
                x_lr_seq = block(x_lr_seq, dims_lr)
                if isinstance(x_lr_seq, tuple):
                    x_lr_seq = x_lr_seq[0]

            # Layer normalization
            x_hr_seq = self.hidden_states_norms_hr[f"stage{idx+1}"](x_hr_seq)
            x_lr_seq = self.hidden_states_norms_lr[f"stage{idx+1}"](x_lr_seq)

            H_hr, W_hr = dims_hr
            H_lr, W_lr = dims_lr
            C_hr = x_hr_seq.shape[-1]
            C_lr = x_lr_seq.shape[-1]

            # Reshape to spatial format
            feat_hr = x_hr_seq.transpose(1, 2).contiguous().view(B, C_hr, H_hr, W_hr)
            feat_lr = x_lr_seq.transpose(1, 2).contiguous().view(B, C_lr, H_lr, W_lr)

            fused_hr = ca(feat_hr, feat_lr)
            fused_hr_seq = fused_hr.flatten(2).transpose(1, 2).contiguous()

            # Downsample if the stage has it
            if stage_hr.downsample is not None:
                fused_hr_seq = stage_hr.downsample(fused_hr_seq, dims_hr)
                dims_hr = (dims_hr[0] // 2, dims_hr[1] // 2)
            if stage_lr.downsample is not None:
                x_lr_seq = stage_lr.downsample(x_lr_seq, dims_lr)
                dims_lr = (dims_lr[0] // 2, dims_lr[1] // 2)

            features_hr[f"stage{idx+1}"] = fused_hr
            features_lr[f"stage{idx+1}"] = feat_lr
            x_hr_seq = fused_hr_seq

        # Decode HR features
        features_tuple = (
            features_hr["stage1"],
            features_hr["stage2"],
            features_hr["stage3"],
            features_hr["stage4"],
        )
        logits = self.decoder(features_tuple)

        # Decode LR features (for auxiliary supervision)
        features_tuple_lr = (
            features_lr["stage1"],
            features_lr["stage2"],
            features_lr["stage3"],
            features_lr["stage4"],
        )
        logits_lr = self.decoder_lr(features_tuple_lr)

        return {"logits_hr": logits, "logits_lr": logits_lr}


def _test_segformer_head():
    """Quick, offline test for SegFormer head input/output shapes."""
    # Example dims for Swin-Tiny stages:
    dims = [96, 192, 384, 768]
    cfg = SegformerConfig(
        num_labels=7,
        hidden_sizes=dims,
        num_encoder_blocks=4,
        decoder_hidden_size=512,
        classifier_dropout_prob=0.0,
    )
    head = SegformerDecodeHead(cfg)

    B = 2
    # Stage resolutions typically differ by /2 each time; here we mimic that.
    f1 = torch.randn(B, dims[0], 128, 128)
    f2 = torch.randn(B, dims[1], 64, 64)
    f3 = torch.randn(B, dims[2], 32, 32)
    f4 = torch.randn(B, dims[3], 16, 16)

    logits = head((f1, f2, f3, f4))
    assert logits.shape == (B, cfg.num_labels, 128, 128), f"Unexpected logits shape: {logits.shape}"
    return logits.shape


if __name__ == "__main__":
    print("SegFormer head test logits shape:", _test_segformer_head())
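Both decode heads in this file emit logits at the stage-1 feature resolution (1/4 of the input, as the shape assertion in _test_segformer_head shows), so a training step needs to upsample before the loss. A sketch under that assumption — model, x_hr, x_lr, labels_hr, and labels_lr are assumed to exist, and the 0.4 auxiliary weight is illustrative, not taken from the commit:

import torch.nn.functional as F

out = model(x_hr, x_lr)  # CASWiT from this file
logits_hr = F.interpolate(out["logits_hr"], size=x_hr.shape[-2:], mode="bilinear", align_corners=False)
logits_lr = F.interpolate(out["logits_lr"], size=x_lr.shape[-2:], mode="bilinear", align_corners=False)
loss = F.cross_entropy(logits_hr, labels_hr) + 0.4 * F.cross_entropy(logits_lr, labels_lr)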
model/CASWiT_ssl.py
ADDED
@@ -0,0 +1,292 @@
"""
CASWiT Self-Supervised Learning (SSL) Module

Implements SimMIM-based self-supervised pre-training for CASWiT using
masked image modeling with dual-branch HR/LR processing.
"""

import math
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import UperNetForSemanticSegmentation
from transformers.utils import logging as hf_logging

hf_logging.set_verbosity_error()
hf_logging.disable_progress_bar()


def random_masking_with_tokens(x: torch.Tensor, mask_ratio: float = 0.75,
                               mask_token: Optional[torch.Tensor] = None):
    """
    Random masking at token level with a learned mask token.

    Args:
        x: Input tokens [B, N, C]
        mask_ratio: Ratio of tokens to mask
        mask_token: Learnable mask token

    Returns:
        x_masked: Masked tokens [B, N, C]
        mask: Binary mask [B, N] where 0=visible, 1=masked
        ids_restore: Indices to restore the original order
    """
    B, N, C = x.shape
    len_keep = int(N * (1 - mask_ratio))

    # Shuffle token indices with per-sample random noise (MAE-style masking)
    noise = torch.rand(B, N, device=x.device)
    ids_shuffle = torch.argsort(noise, dim=1)
    ids_restore = torch.argsort(ids_shuffle, dim=1)
    ids_keep = ids_shuffle[:, :len_keep]

    x_keep = torch.gather(x, 1, ids_keep.unsqueeze(-1).expand(-1, -1, C))

    if mask_token is None:
        mask_token = torch.zeros((1, C), device=x.device)
    m_tok = mask_token.view(1, 1, C).expand(B, N - len_keep, C)

    # Concatenate kept tokens and mask tokens, then unshuffle back into place
    x_cat = torch.cat([x_keep, m_tok], dim=1)
    x_masked = torch.gather(x_cat, 1, ids_restore.unsqueeze(-1).expand(-1, -1, C))

    mask = torch.ones(B, N, device=x.device)
    mask[:, :len_keep] = 0
    mask = torch.gather(mask, 1, ids_restore)
    return x_masked, mask, ids_restore


def center_masking_with_tokens(x: torch.Tensor, mask_token: Optional[torch.Tensor] = None,
                               mask_ratio: float = 0.5):
    """
    Deterministic centered square mask.

    Args:
        x: Input tokens [B, N, C]
        mask_token: Learnable mask token
        mask_ratio: Ratio of tokens to mask

    Returns:
        x_masked: Masked tokens [B, N, C]
        mask: Binary mask [B, N]
        ids_restore: Indices to restore the original order
    """
    B, N, C = x.shape
    H = W = int(N ** 0.5)
    assert H * W == N, "N must be a perfect square"
    # Side length of the masked square covering ~mask_ratio of the tokens
    L = int(round(H * (mask_ratio ** 0.5)))
    start = (H - L) // 2
    end = start + L

    mask_2d = torch.zeros(H, W, device=x.device, dtype=torch.bool)
    mask_2d[start:end, start:end] = True
    mask = mask_2d.view(1, -1).expand(B, -1)  # (B, N)

    if mask_token is None:
        mask_token = torch.zeros(C, device=x.device)
    mask_token = mask_token.view(-1)

    x_masked = x * (~mask).unsqueeze(-1) + mask.unsqueeze(-1) * mask_token.view(1, 1, C)
    ids_restore = torch.arange(N, device=x.device).unsqueeze(0).expand(B, N)
    return x_masked, mask.to(x_masked.dtype), ids_restore

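# --- Illustrative note (not part of the original file) -----------------------
# Both maskers keep the token count fixed and substitute masked positions with
# the (learned) mask token. For a 4x4 token grid (N = 16):
#
#   x = torch.randn(2, 16, 128)
#   xc, mc, _ = center_masking_with_tokens(x, mask_ratio=0.25)
#   # mc.view(2, 4, 4)[0] is a centered 2x2 square of ones (the masked tokens)
#   xr, mr, _ = random_masking_with_tokens(x, mask_ratio=0.75)
#   # mr.sum(dim=1) == 12 for every sample (75% of 16 tokens masked)
# ------------------------------------------------------------------------------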
| 94 |
+
class CrossAttentionBlock(nn.Module):
|
| 95 |
+
"""Simplified cross-attention block for SSL."""
|
| 96 |
+
def __init__(self, C_hr, C_lr, num_heads=8, dropout=0.0):
|
| 97 |
+
super().__init__()
|
| 98 |
+
self.cross_attn = nn.MultiheadAttention(
|
| 99 |
+
embed_dim=C_hr, num_heads=num_heads, kdim=C_lr, vdim=C_lr,
|
| 100 |
+
dropout=dropout, batch_first=True
|
| 101 |
+
)
|
| 102 |
+
self.norm = nn.LayerNorm(C_hr)
|
| 103 |
+
self.mlp = nn.Sequential(
|
| 104 |
+
nn.LayerNorm(C_hr),
|
| 105 |
+
nn.Linear(C_hr, C_hr * 4),
|
| 106 |
+
nn.GELU(),
|
| 107 |
+
nn.Linear(C_hr * 4, C_hr),
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
def forward(self, x_hr, x_lr):
|
| 111 |
+
B, C_hr, H_hr, W_hr = x_hr.shape
|
| 112 |
+
_, C_lr, H_lr, W_lr = x_lr.shape
|
| 113 |
+
q = x_hr.flatten(2).transpose(1, 2) # (B,N_hr,C_hr)
|
| 114 |
+
kv = x_lr.flatten(2).transpose(1, 2) # (B,N_lr,C_lr)
|
| 115 |
+
attn_out, _ = self.cross_attn(q, kv, kv)
|
| 116 |
+
y = self.norm(q + attn_out)
|
| 117 |
+
y = y + self.mlp(y)
|
| 118 |
+
return y.transpose(1, 2).view(B, C_hr, H_hr, W_hr)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
class CASWiT_SSL(nn.Module):
|
| 122 |
+
"""
|
| 123 |
+
CASWiT Self-Supervised Learning model using SimMIM.
|
| 124 |
+
|
| 125 |
+
Encoder: Dual Swin backbones with cross-attention blocks
|
| 126 |
+
Decoder: Conv1x1 + PixelShuffle for reconstruction
|
| 127 |
+
Masking: HR random masking, LR center masking
|
| 128 |
+
|
| 129 |
+
Args:
|
| 130 |
+
model_name: HuggingFace model identifier
|
| 131 |
+
mask_ratio_hr: Masking ratio for HR branch
|
| 132 |
+
mask_ratio_lr: Masking ratio for LR branch
|
| 133 |
+
patch_size: Patch size for masking
|
| 134 |
+
encoder_stride: Encoder stride for decoder
|
| 135 |
+
xa_heads: Number of cross-attention heads per stage
|
| 136 |
+
"""
|
| 137 |
+
def __init__(self, model_name: str = "openmmlab/upernet-swin-base",
|
| 138 |
+
mask_ratio_hr: float = 0.75, mask_ratio_lr: float = 0.5,
|
| 139 |
+
patch_size: int = 4, encoder_stride: int = 32,
|
| 140 |
+
xa_heads: Tuple[int, int, int, int] = (8, 8, 8, 8)):
|
| 141 |
+
super().__init__()
|
| 142 |
+
self.mask_ratio_hr = mask_ratio_hr
|
| 143 |
+
self.mask_ratio_lr = mask_ratio_lr
|
| 144 |
+
self.patch_size = patch_size
|
| 145 |
+
self.encoder_stride = encoder_stride
|
| 146 |
+
|
| 147 |
+
# Load two UPerNet (Swin) backbones
|
| 148 |
+
model_hr = UperNetForSemanticSegmentation.from_pretrained(
|
| 149 |
+
model_name, ignore_mismatched_sizes=True
|
| 150 |
+
)
|
| 151 |
+
model_lr = UperNetForSemanticSegmentation.from_pretrained(
|
| 152 |
+
model_name, ignore_mismatched_sizes=True
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
self.embeddings_hr = model_hr.backbone.embeddings
|
| 156 |
+
self.encoder_layers_hr = model_hr.backbone.encoder.layers
|
| 157 |
+
self.hidden_states_norms_hr = model_hr.backbone.hidden_states_norms
|
| 158 |
+
|
| 159 |
+
self.embeddings_lr = model_lr.backbone.embeddings
|
| 160 |
+
self.encoder_layers_lr = model_lr.backbone.encoder.layers
|
| 161 |
+
self.hidden_states_norms_lr = model_lr.backbone.hidden_states_norms
|
| 162 |
+
|
| 163 |
+
# Cross-attention blocks with explicit Swin-Base dims
|
| 164 |
+
dims = [128, 256, 512, 1024]
|
| 165 |
+
self.cross_attn_blocks = nn.ModuleList([
|
| 166 |
+
CrossAttentionBlock(d, d, num_heads=h) for d, h in zip(dims, xa_heads)
|
| 167 |
+
])
|
| 168 |
+
|
| 169 |
+
# Learnable mask tokens
|
| 170 |
+
self.mask_token_hr = nn.Parameter(torch.zeros(1, dims[0]))
|
| 171 |
+
self.mask_token_lr = nn.Parameter(torch.zeros(1, dims[0]))
|
| 172 |
+
|
| 173 |
+
# SimMIM decoder: Conv1×1 → PixelShuffle(stride)
|
| 174 |
+
self.decoder_conv = None # lazy init after we know C_last
|
| 175 |
+
self.decoder_shuffle = nn.PixelShuffle(self.encoder_stride)
|
| 176 |
+
|
| 177 |
+
# Store masks for visualization
|
| 178 |
+
self.last_mask_hr = None
|
| 179 |
+
self.last_mask_lr = None
|
| 180 |
+
|
| 181 |
+
def _encode(self, x_hr: torch.Tensor, x_lr: torch.Tensor):
|
| 182 |
+
"""Encode with masking and return reconstruction targets."""
|
| 183 |
+
B, C, H, W = x_hr.shape
|
| 184 |
+
target_img = x_hr
|
| 185 |
+
target_lr = x_lr
|
| 186 |
+
|
| 187 |
+
# Patch embeddings
|
| 188 |
+
x_hr_seq, _ = self.embeddings_hr(x_hr) # (B, N_hr, C1)
|
| 189 |
+
x_lr_seq, _ = self.embeddings_lr(x_lr) # (B, N_lr, C1)
|
| 190 |
+
|
| 191 |
+
# Masking
|
| 192 |
+
x_hr_seq, mask_hr, _ = random_masking_with_tokens(
|
| 193 |
+
x_hr_seq, self.mask_ratio_hr, self.mask_token_hr
|
| 194 |
+
)
|
| 195 |
+
x_lr_seq, mask_lr, _ = center_masking_with_tokens(
|
| 196 |
+
x_lr_seq, self.mask_token_lr, mask_ratio=self.mask_ratio_lr
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
# Initial spatial dims
|
| 200 |
+
H_hr = W_hr = int(math.sqrt(x_hr_seq.shape[1]))
|
| 201 |
+
H_lr = W_lr = int(math.sqrt(x_lr_seq.shape[1]))
|
| 202 |
+
dims_hr = (H_hr, W_hr)
|
| 203 |
+
dims_lr = (H_lr, W_lr)
|
| 204 |
+
|
| 205 |
+
# Walk encoder stages with cross attention at each stage
|
| 206 |
+
for idx, (stage_hr, stage_lr, ca) in enumerate(zip(
|
| 207 |
+
self.encoder_layers_hr, self.encoder_layers_lr, self.cross_attn_blocks
|
| 208 |
+
)):
|
| 209 |
+
# HR blocks
|
| 210 |
+
for block in stage_hr.blocks:
|
| 211 |
+
x_hr_seq = block(x_hr_seq, dims_hr)
|
| 212 |
+
if isinstance(x_hr_seq, tuple):
|
                    x_hr_seq = x_hr_seq[0]

            # LR branch blocks
            for block in stage_lr.blocks:
                x_lr_seq = block(x_lr_seq, dims_lr)
                if isinstance(x_lr_seq, tuple):
                    x_lr_seq = x_lr_seq[0]

            # Per-stage layer norms
            x_hr_seq = self.hidden_states_norms_hr[f"stage{idx+1}"](x_hr_seq)
            x_lr_seq = self.hidden_states_norms_lr[f"stage{idx+1}"](x_lr_seq)

            # Reshape sequences to spatial feature maps
            B_, N_hr_, C_hr_ = x_hr_seq.shape
            B_, N_lr_, C_lr_ = x_lr_seq.shape
            Hh, Wh = dims_hr
            Hl, Wl = dims_lr
            feat_hr = x_hr_seq.transpose(1, 2).contiguous().view(B_, C_hr_, Hh, Wh)
            feat_lr = x_lr_seq.transpose(1, 2).contiguous().view(B_, C_lr_, Hl, Wl)

            # Cross-fuse HR <- LR
            fused_hr = ca(feat_hr, feat_lr)
            x_hr_seq = fused_hr.flatten(2).transpose(1, 2).contiguous()

            # Downsample to next stage
            if stage_hr.downsample is not None:
                x_hr_seq = stage_hr.downsample(x_hr_seq, dims_hr)
                dims_hr = (dims_hr[0] // 2, dims_hr[1] // 2)
            if stage_lr.downsample is not None:
                x_lr_seq = stage_lr.downsample(x_lr_seq, dims_lr)
                dims_lr = (dims_lr[0] // 2, dims_lr[1] // 2)

        # Last-stage feature map z (B, C_last, H/stride, W/stride)
        Hs, Ws = dims_hr
        C_last = x_hr_seq.shape[-1]
        z = x_hr_seq.transpose(1, 2).contiguous().view(B, C_last, Hs, Ws)

        # Lazily initialize the decoder conv once C_last is known
        if self.decoder_conv is None:
            self.decoder_conv = nn.Conv2d(
                C_last, (self.encoder_stride ** 2) * 3, kernel_size=1
            ).to(z.device)

        # Reconstruction via pixel shuffle back to image space
        x_rec = self.decoder_shuffle(self.decoder_conv(z))  # (B, 3, H, W)

        # Convert patch masks to pixel masks
        Mh = int(math.sqrt(mask_hr.shape[1]))
        mask_patch_hr = mask_hr.view(B, Mh, Mh)
        mask_pix_hr = mask_patch_hr.repeat_interleave(
            self.patch_size, 1
        ).repeat_interleave(self.patch_size, 2).unsqueeze(1).contiguous()

        Ml = int(math.sqrt(mask_lr.shape[1]))
        mask_patch_lr = mask_lr.view(B, Ml, Ml)
        mask_pix_lr = mask_patch_lr.repeat_interleave(
            self.patch_size, 1
        ).repeat_interleave(self.patch_size, 2).unsqueeze(1).contiguous()

        self.last_mask_hr = mask_patch_hr
        self.last_mask_lr = mask_patch_lr

        return x_rec, target_img, mask_pix_hr, target_lr, mask_pix_lr

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for SSL training.

        Returns the L1 reconstruction loss computed on masked pixels only.
        """
        x_rec, target_img, mask_pix, _, _ = self._encode(x_hr, x_lr)
        loss_recon = F.l1_loss(target_img, x_rec, reduction='none')
        # Sum over masked pixels, normalize by the masked area (eps for
        # stability) and by the number of channels
        loss = (loss_recon * mask_pix).sum() / (mask_pix.sum() + 1e-6) / target_img.shape[1]
        return loss

    @torch.no_grad()
    def forward_outputs(self, x_hr: torch.Tensor, x_lr: torch.Tensor):
        """Forward pass returning all outputs for visualization."""
        x_rec, target_img, mask_pix_hr, target_lr, mask_pix_lr = self._encode(x_hr, x_lr)
        return x_rec, target_img, mask_pix_hr, target_lr, mask_pix_lr
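The objective above is a masked-image-modeling loss: per-pixel L1 between reconstruction and target, kept only where the upsampled patch mask is 1, normalized by the masked area (plus 1e-6 for stability) and the channel count. A minimal training-step sketch follows; the batch size, 224x224 inputs, and AdamW settings are illustrative assumptions, and CASWiT_SSL is constructed the same way build_model does below:

import torch
from model.CASWiT_ssl import CASWiT_SSL

model = CASWiT_SSL(model_name="openmmlab/upernet-swin-tiny")
x_hr = torch.randn(2, 3, 224, 224)  # high-resolution crops
x_lr = torch.randn(2, 3, 224, 224)  # low-resolution context
loss = model(x_hr, x_lr)            # scalar loss over masked pixels only
# decoder_conv is created lazily on this first forward, so build the
# optimizer afterwards or its parameters will be missed
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
loss.backward()
opt.step()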
model/CASWiT_upernet.py
ADDED
@@ -0,0 +1,250 @@
"""
CASWiT: Context-Aware Swin Transformer for Ultra-High Resolution Semantic Segmentation

This module implements the main CASWiT model architecture with dual-branch
high-resolution and low-resolution processing with cross-attention fusion.
"""

import math
from typing import Dict
import torch
import torch.nn as nn
from transformers import UperNetForSemanticSegmentation
from transformers.utils import logging as hf_logging

hf_logging.set_verbosity_error()
hf_logging.disable_progress_bar()


class DropPath(nn.Module):
    """Drop path (stochastic depth) regularization module."""
    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = float(drop_prob)

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep = 1.0 - self.drop_prob
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = x.new_empty(shape).bernoulli_(keep).div_(keep)
        return x * mask


class CrossFusionBlock(nn.Module):
    """
    Cross-attention fusion block that lets HR features attend to LR features.

    Implements pre-norm cross-attention (Q=HR, K/V=LR).

    Args:
        C_hr: Channel dimension of HR features
        C_lr: Channel dimension of LR features
        num_heads: Number of attention heads
        mlp_ratio: MLP expansion ratio
        drop: Dropout rate
        drop_path: Drop path rate
    """
    def __init__(self, C_hr: int, C_lr: int, num_heads: int = 8,
                 mlp_ratio: float = 4.0, drop: float = 0.0, drop_path: float = 0.1):
        super().__init__()

        self.norm_q = nn.LayerNorm(C_hr)
        self.norm_kv = nn.LayerNorm(C_lr)
        self.attn = nn.MultiheadAttention(
            embed_dim=C_hr, num_heads=num_heads, kdim=C_lr, vdim=C_lr,
            dropout=drop, batch_first=True
        )
        # Stochastic depth on both residual branches (the drop_path argument
        # was previously accepted but never applied, leaving DropPath unused)
        self.drop_path = DropPath(drop_path)

        hidden = int(C_hr * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.LayerNorm(C_hr),
            nn.Linear(C_hr, hidden),
            nn.GELU(),
            nn.Linear(hidden, C_hr),
        )

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the cross-attention fusion block.

        Args:
            x_hr: HR features [B, C_hr, H_hr, W_hr]
            x_lr: LR features [B, C_lr, H_lr, W_lr]

        Returns:
            Fused HR features [B, C_hr, H_hr, W_hr]
        """
        B, C_hr, H_hr, W_hr = x_hr.shape
        _, C_lr, H_lr, W_lr = x_lr.shape

        # Flatten spatial maps to token sequences
        q = x_hr.flatten(2).transpose(1, 2)   # [B, N_hr, C_hr]
        kv = x_lr.flatten(2).transpose(1, 2)  # [B, N_lr, C_lr]

        # Pre-norm
        qn = self.norm_q(q)
        kvn = self.norm_kv(kv)

        attn_out, _ = self.attn(qn, kvn, kvn)  # [B, N_hr, C_hr]

        # Residual connections with stochastic depth
        y = q + self.drop_path(attn_out)
        y = y + self.drop_path(self.mlp(y))

        return y.transpose(1, 2).view(B, C_hr, H_hr, W_hr)

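
# Shape sanity check for CrossFusionBlock (a minimal sketch; tensor sizes are
# illustrative, and the LR grid may be coarser than the HR grid since both
# maps are flattened to token sequences independently):
#
#   block = CrossFusionBlock(C_hr=96, C_lr=96, num_heads=8)
#   feat_hr = torch.randn(2, 96, 56, 56)  # HR stage-1 map
#   feat_lr = torch.randn(2, 96, 28, 28)  # coarser LR context map
#   out = block(feat_hr, feat_lr)         # -> (2, 96, 56, 56), HR geometry kept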

class CASWiT(nn.Module):
    """
    CASWiT: Context-Aware Swin Transformer for Ultra-High Resolution Semantic Segmentation.

    Dual-branch architecture with:
      - HR branch: processes high-resolution crops
      - LR branch: processes low-resolution context
      - Cross-attention fusion at each encoder stage

    Args:
        num_head_xa: Number of cross-attention heads
        num_classes: Number of segmentation classes
        model_name: HuggingFace model identifier for UPerNet-Swin
        mlp_ratio: MLP expansion ratio in fusion blocks
        drop_path: Drop path rate
    """
    def __init__(self, num_head_xa: int = 1, num_classes: int = 12,
                 model_name: str = "openmmlab/upernet-swin-tiny",
                 mlp_ratio: float = 4.0, drop_path: float = 0.1):
        super().__init__()
        # Load two pretrained UPerNet backbones (HR and LR branches)
        model_hr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )
        model_lr = UperNetForSemanticSegmentation.from_pretrained(
            model_name, num_labels=num_classes, ignore_mismatched_sizes=True
        )

        # HR branch components
        self.embeddings_hr = model_hr.backbone.embeddings
        self.encoder_layers_hr = model_hr.backbone.encoder.layers
        self.hidden_states_norms_hr = model_hr.backbone.hidden_states_norms
        self.decoder = model_hr.decode_head

        # LR branch components
        self.embeddings_lr = model_lr.backbone.embeddings
        self.encoder_layers_lr = model_lr.backbone.encoder.layers
        self.hidden_states_norms_lr = model_lr.backbone.hidden_states_norms
        self.decoder_lr = model_lr.decode_head

        # Cross-attention blocks at each stage.
        # Stage dims: tiny [96, 192, 384, 768], base [128, 256, 512, 1024],
        # large [192, 384, 768, 1536]
        dims_map = {
            "tiny": [96, 192, 384, 768],
            "base": [128, 256, 512, 1024],
            "large": [192, 384, 768, 1536],
        }
        # Infer stage dimensions from the model name
        if "tiny" in model_name.lower():
            dims = dims_map["tiny"]
        elif "large" in model_name.lower():
            dims = dims_map["large"]
        else:
            dims = dims_map["base"]  # default to base

        self.cross_attn_blocks = nn.ModuleList([
            CrossFusionBlock(dim, dim, num_heads=num_head_xa,
                             mlp_ratio=mlp_ratio, drop=0.0, drop_path=drop_path)
            for dim in dims
        ])

    def forward(self, x_hr: torch.Tensor, x_lr: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Forward pass through the CASWiT model.

        Args:
            x_hr: HR input images [B, 3, H_hr, W_hr]
            x_lr: LR input images [B, 3, H_lr, W_lr]

        Returns:
            Dictionary with 'logits_hr' and 'logits_lr' segmentation logits
        """
        B = x_hr.size(0)

        # Patch embeddings
        x_hr_seq, _ = self.embeddings_hr(x_hr)
        x_lr_seq, _ = self.embeddings_lr(x_lr)

        N_hr, C_hr = x_hr_seq.shape[1], x_hr_seq.shape[2]
        N_lr, C_lr = x_lr_seq.shape[1], x_lr_seq.shape[2]
        H_hr = W_hr = int(math.sqrt(N_hr))  # assumes square inputs
        H_lr = W_lr = int(math.sqrt(N_lr))
        dims_hr = (H_hr, W_hr)
        dims_lr = (H_lr, W_lr)

        features_hr: Dict[str, torch.Tensor] = {}
        features_lr: Dict[str, torch.Tensor] = {}

        # Process encoder stages with cross-attention fusion
        for idx, (stage_hr, stage_lr, ca) in enumerate(zip(
            self.encoder_layers_hr, self.encoder_layers_lr, self.cross_attn_blocks
        )):
            # HR branch blocks
            for block in stage_hr.blocks:
                x_hr_seq = block(x_hr_seq, dims_hr)
                if isinstance(x_hr_seq, tuple):
                    x_hr_seq = x_hr_seq[0]

            # LR branch blocks
            for block in stage_lr.blocks:
                x_lr_seq = block(x_lr_seq, dims_lr)
                if isinstance(x_lr_seq, tuple):
                    x_lr_seq = x_lr_seq[0]

            # Per-stage layer normalization
            x_hr_seq = self.hidden_states_norms_hr[f"stage{idx+1}"](x_hr_seq)
            x_lr_seq = self.hidden_states_norms_lr[f"stage{idx+1}"](x_lr_seq)

            H_hr, W_hr = dims_hr
            H_lr, W_lr = dims_lr
            C_hr = x_hr_seq.shape[-1]
            C_lr = x_lr_seq.shape[-1]

            # Reshape to spatial format
            feat_hr = x_hr_seq.transpose(1, 2).contiguous().view(B, C_hr, H_hr, W_hr)
            feat_lr = x_lr_seq.transpose(1, 2).contiguous().view(B, C_lr, H_lr, W_lr)

            fused_hr = ca(feat_hr, feat_lr)
            fused_hr_seq = fused_hr.flatten(2).transpose(1, 2).contiguous()

            # Downsample if the stage has a patch-merging layer
            if stage_hr.downsample is not None:
                fused_hr_seq = stage_hr.downsample(fused_hr_seq, dims_hr)
                dims_hr = (dims_hr[0] // 2, dims_hr[1] // 2)
            if stage_lr.downsample is not None:
                x_lr_seq = stage_lr.downsample(x_lr_seq, dims_lr)
                dims_lr = (dims_lr[0] // 2, dims_lr[1] // 2)

            features_hr[f"stage{idx+1}"] = fused_hr
            features_lr[f"stage{idx+1}"] = feat_lr
            x_hr_seq = fused_hr_seq

        # Decode fused HR features
        features_tuple = (
            features_hr["stage1"],
            features_hr["stage2"],
            features_hr["stage3"],
            features_hr["stage4"],
        )
        logits = self.decoder(features_tuple)

        # Decode LR features (auxiliary supervision)
        features_tuple_lr = (
            features_lr["stage1"],
            features_lr["stage2"],
            features_lr["stage3"],
            features_lr["stage4"],
        )
        logits_lr = self.decoder_lr(features_tuple_lr)

        return {"logits_hr": logits, "logits_lr": logits_lr}

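A minimal inference sketch for this UPerNet variant (input sizes are illustrative; square inputs are assumed, matching the sqrt-based grid recovery above). Note the decode heads are called directly, so the logits come out at the backbone's reduced feature resolution and typically need bilinear upsampling to the label size:

import torch
from model.CASWiT_upernet import CASWiT

model = CASWiT(num_head_xa=1, num_classes=12).eval()
x_hr = torch.randn(1, 3, 224, 224)  # high-resolution crop
x_lr = torch.randn(1, 3, 224, 224)  # downsampled context window
with torch.no_grad():
    out = model(x_hr, x_lr)
print(out["logits_hr"].shape)  # (1, num_classes, h, w) at feature resolution
print(out["logits_lr"].shape)  # auxiliary LR-branch logits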
model/build_model.py
ADDED
@@ -0,0 +1,38 @@
from __future__ import annotations

from typing import Any

from model.CASWiT_upernet import CASWiT as CASWiT_UperNet
from model.CASWiT_segformer import CASWiT as CASWiT_SegFormer
from model.CASWiT_m2f import CASWiT as CASWiT_Mask2Former
from model.CASWiT_fusion_last_stage_add import CASWiT as CASWiT_FusionLastStageAdd
from model.CASWiT_ssl import CASWiT_SSL


def _get(cfg: Any, name: str, default: Any = None) -> Any:
    return getattr(cfg, name, default)


def build_model(cfg: Any):
    head = _get(cfg, "model_head", None) or _get(cfg, "head", None) or "upernet"
    head = str(head).lower()

    # Defaults mirror the CASWiT constructor so missing config fields fall
    # back gracefully instead of failing on int(None)/float(None)
    common = dict(
        num_head_xa=int(_get(cfg, "cross_attention_heads", 1)),
        num_classes=int(_get(cfg, "num_classes", 12)),
        model_name=str(_get(cfg, "model_name", "openmmlab/upernet-swin-tiny")),
        mlp_ratio=float(_get(cfg, "fusion_mlp_ratio", 4.0)),
        drop_path=float(_get(cfg, "fusion_drop_path", 0.1)),
    )

    if head in ("upernet", "caswit", "default"):
        return CASWiT_UperNet(**common)
    if head in ("segformer",):
        return CASWiT_SegFormer(**common)
    if head in ("mask2former", "m2f"):
        return CASWiT_Mask2Former(**common)
    if head in ("fusion_last_stage_add", "last_stage_add"):
        return CASWiT_FusionLastStageAdd(**common)
    if head in ("ssl", "caswit_ssl"):
        return CASWiT_SSL(model_name=common["model_name"])
    raise ValueError(
        f"Unknown model head: {head}. "
        "Available: upernet, segformer, mask2former, fusion_last_stage_add, ssl"
    )
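build_model only needs an attribute-bearing config object; a minimal sketch with types.SimpleNamespace (field values mirror the constructor defaults and are illustrative, not tuned settings):

from types import SimpleNamespace
from model.build_model import build_model

cfg = SimpleNamespace(
    model_head="upernet",
    cross_attention_heads=1,
    num_classes=12,
    model_name="openmmlab/upernet-swin-tiny",
    fusion_mlp_ratio=4.0,
    fusion_drop_path=0.1,
)
model = build_model(cfg)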