Update all files for SkySensepp

Browse files

Files changed (1) hide show

sky_sensepp_impl/compat.py +865 -0

sky_sensepp_impl/compat.py ADDED Viewed

	@@ -0,0 +1,865 @@

+"""Pure PyTorch replacements for mmcv, mmseg, and mmcls utilities.
+This module provides drop-in replacements so the codebase can run without
+the mm* ecosystem installed.  Every public symbol mirrors the original API
+as used throughout the repository.
+"""
+import collections.abc
+import logging
+import math
+import warnings
+from functools import partial
+from typing import Optional, Sequence, Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.batchnorm import _BatchNorm
+from torch.nn.modules.utils import _pair as to_2tuple
+# ---------------------------------------------------------------------------
+# Logging helper (replaces get_root_logger from mmseg/mmcls)
+# ---------------------------------------------------------------------------
+def get_root_logger(log_file=None, log_level=logging.INFO):
+    """Return the root logger, set up for console and optional file output.
+    A ``StreamHandler`` is attached only when the root logger has no handler
+    yet.  When *log_file* is given, a ``FileHandler`` for that path is
+    attached as well, at most once per path.
+    Args:
+        log_file: Optional path of a file that should also receive records.
+        log_level: Level set on the logger and on the handlers created here.
+    Returns:
+        logging.Logger: The root logger.
+    """
+    import os  # local import: only needed to normalise *log_file*
+    logger = logging.getLogger()
+    if not logger.handlers:
+        stream_handler = logging.StreamHandler()
+        stream_handler.setLevel(log_level)
+        logger.addHandler(stream_handler)
+    if log_file is not None:
+        # FileHandler stores its target as an absolute path in ``baseFilename``;
+        # compare against that so repeated calls do not stack handlers.
+        target = os.path.abspath(os.fspath(log_file))
+        already_attached = any(
+            isinstance(handler, logging.FileHandler)
+            and getattr(handler, "baseFilename", None) == target
+            for handler in logger.handlers
+        )
+        if not already_attached:
+            file_handler = logging.FileHandler(target)
+            file_handler.setLevel(log_level)
+            logger.addHandler(file_handler)
+    logger.setLevel(log_level)
+    return logger
+# ---------------------------------------------------------------------------
+# DropPath (Stochastic Depth)
+# ---------------------------------------------------------------------------
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample.
+    In training mode each sample of the batch (dim 0) is zeroed with
+    probability ``drop_prob``; surviving samples are scaled by
+    ``1 / (1 - drop_prob)``.  In eval mode, or when ``drop_prob`` is 0,
+    the input is returned unchanged.
+    """
+    def __init__(self, drop_prob: float = 0.0):
+        super().__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.drop_prob == 0.0 or not self.training:
+            return x
+        keep_prob = 1.0 - self.drop_prob
+        # One Bernoulli draw per sample, broadcast over all remaining dims.
+        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+        # With drop_prob == 1 the mask is all zeros; dividing by keep_prob == 0
+        # would turn it into NaN, so only rescale when something can survive.
+        if keep_prob > 0.0:
+            random_tensor.div_(keep_prob)
+        return x * random_tensor
+    def extra_repr(self) -> str:
+        return f"drop_prob={self.drop_prob:.3f}"
+def _build_dropout(cfg):
+    """Build a dropout layer from *cfg* dict.
+    Supports ``dict(type='DropPath', drop_prob=X)`` and plain
+    ``nn.Dropout``-style configs (``p`` or ``drop_prob``, optional
+    ``inplace``).  Returns ``nn.Identity`` when *cfg* is ``None`` or the
+    drop probability is zero/unset.
+    Raises:
+        ValueError: If ``cfg['type']`` is neither ``'DropPath'`` nor
+            ``'Dropout'``.
+    """
+    if cfg is None:
+        return nn.Identity()
+    cfg = cfg.copy()  # never mutate the caller's config
+    tp = cfg.pop("type", "Dropout")
+    if tp == "DropPath":
+        drop_prob = cfg.get("drop_prob", 0.0)
+        # A zero-probability layer is a no-op; Identity makes that explicit.
+        if not drop_prob:
+            return nn.Identity()
+        return DropPath(drop_prob=drop_prob)
+    if tp == "Dropout":
+        p = cfg.get("p", cfg.get("drop_prob", 0.0))
+        if not p:
+            return nn.Identity()
+        return nn.Dropout(p=p, inplace=cfg.get("inplace", False))
+    raise ValueError(f"Unsupported dropout type: {tp}")
+# ---------------------------------------------------------------------------
+# build_norm_layer
+# ---------------------------------------------------------------------------
+# Short prefixes used to build the returned layer name (e.g. ``'bn1'``).
+_NORM_ABBR = {
+    "LN": "ln",
+    "BN": "bn",
+    "SyncBN": "bn",
+    "GN": "gn",
+    "IN": "in",
+}
+def build_norm_layer(cfg: dict, num_features: int, postfix: Union[int, str] = 0):
+    """Build a normalization layer from a config dict.
+    Args:
+        cfg: Config with a mandatory ``type`` key.  An optional
+            ``requires_grad`` key (default ``True``) is applied to the
+            layer's parameters instead of being passed to the constructor.
+            ``num_groups`` (default 32) is read for GroupNorm.  All other
+            keys are forwarded to the torch layer.  *cfg* is not modified.
+        num_features: Number of channels / normalized features.
+        postfix: Appended to the abbreviation to form the name.
+    Returns:
+        tuple[str, nn.Module]: ``(name, layer)``
+    The *name* is e.g. ``'ln1'`` for LayerNorm with postfix 1.
+    Raises:
+        KeyError: If *cfg* has no ``type`` key.
+        ValueError: If the type is not supported.
+    """
+    cfg = cfg.copy()
+    tp = cfg.pop("type")
+    # torch norm constructors do not accept ``requires_grad``; without this
+    # pop a config such as dict(type='BN', requires_grad=True) would raise.
+    requires_grad = cfg.pop("requires_grad", True)
+    abbr = _NORM_ABBR.get(tp, tp.lower())
+    name = f"{abbr}{postfix}"
+    if tp in ("LN", "LayerNorm"):
+        layer = nn.LayerNorm(num_features, **cfg)
+    elif tp in ("BN", "BN2d", "BatchNorm", "BatchNorm2d"):
+        layer = nn.BatchNorm2d(num_features, **cfg)
+    elif tp in ("SyncBN", "SyncBatchNorm"):
+        layer = nn.SyncBatchNorm(num_features, **cfg)
+    elif tp in ("GN", "GroupNorm"):
+        num_groups = cfg.pop("num_groups", 32)
+        layer = nn.GroupNorm(num_groups, num_features, **cfg)
+    elif tp in ("IN", "InstanceNorm", "InstanceNorm2d"):
+        layer = nn.InstanceNorm2d(num_features, **cfg)
+    else:
+        raise ValueError(f"Unsupported norm type: {tp}")
+    for param in layer.parameters():
+        param.requires_grad = requires_grad
+    return name, layer
+# ---------------------------------------------------------------------------
+# Weight initialisation helpers
+# ---------------------------------------------------------------------------
+# Re-export of ``torch.nn.init.trunc_normal_`` (in-place truncated-normal
+# initialiser) so it can be imported from this module.
+trunc_normal_ = nn.init.trunc_normal_
+def constant_init(module: nn.Module, val: float, bias: float = 0.0):
+    """Fill ``module.weight`` with *val* and ``module.bias`` with *bias*.
+    An attribute that is absent or ``None`` is skipped.
+    """
+    for attr_name, fill_value in (("weight", val), ("bias", bias)):
+        tensor = getattr(module, attr_name, None)
+        if tensor is not None:
+            nn.init.constant_(tensor, fill_value)
+def kaiming_init(module: nn.Module, mode: str = "fan_in", bias: float = 0.0,
+                 a: float = 0.0, nonlinearity: str = "relu",
+                 distribution: str = "normal"):
+    """Kaiming-initialise ``module.weight`` and set ``module.bias`` to *bias*.
+    A weight or bias that is absent or ``None`` is skipped, mirroring
+    ``constant_init``.
+    Args:
+        module: Module to initialise in place.
+        mode: ``'fan_in'`` or ``'fan_out'``.
+        bias: Constant written into the bias.
+        a: Negative slope forwarded to the torch initialiser.
+        nonlinearity: Nonlinearity name forwarded to the torch initialiser.
+        distribution: ``'normal'`` or ``'uniform'``.
+    Raises:
+        ValueError: If *distribution* is not one of the two supported names.
+    """
+    if distribution not in ("normal", "uniform"):
+        raise ValueError(f"Unsupported distribution: {distribution}")
+    weight = getattr(module, "weight", None)
+    if weight is not None:
+        if distribution == "uniform":
+            nn.init.kaiming_uniform_(weight, a=a, mode=mode,
+                                     nonlinearity=nonlinearity)
+        else:
+            nn.init.kaiming_normal_(weight, a=a, mode=mode,
+                                    nonlinearity=nonlinearity)
+    if hasattr(module, "bias") and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+# ---------------------------------------------------------------------------
+# BaseModule  (drop-in for mmcv.runner.BaseModule)
+# ---------------------------------------------------------------------------
+class BaseModule(nn.Module):
+    """Lightweight stand-in for ``mmcv.runner.BaseModule``.
+    The constructor takes an optional ``init_cfg`` and merely stores it, so
+    existing constructor signatures keep working.  ``init_weights`` does
+    nothing here; subclasses override it when they need initialisation.
+    """
+    def __init__(self, init_cfg=None):
+        super().__init__()
+        # Kept for API compatibility only; nothing in this class reads it.
+        self.init_cfg = init_cfg
+    def init_weights(self):
+        """Intentionally empty hook."""
+        return None
+# ``ModuleList`` is ``torch.nn.ModuleList`` under another import path: once
+# ``from mmcv.runner.base_module import ModuleList`` has been rewritten to
+# ``from .compat import ModuleList`` the calling code needs no other change.
+ModuleList = nn.ModuleList
+# ---------------------------------------------------------------------------
+# CheckpointLoader & load_state_dict
+# ---------------------------------------------------------------------------
+class CheckpointLoader:
+    """Namespace exposing a single ``load_checkpoint`` static method."""
+    @staticmethod
+    def load_checkpoint(path, logger=None, map_location="cpu"):
+        """Load a checkpoint file with ``torch.load`` and return its content.
+        A truthy *logger* receives one ``info`` record before loading.
+        NOTE: ``torch.load`` unpickles the file, so *path* should only point
+        to checkpoints from a trusted source.
+        """
+        if logger:
+            logger.info(f"Loading checkpoint from {path}")
+        checkpoint = torch.load(path, map_location=map_location)
+        return checkpoint
+def load_state_dict(model, state_dict, strict=False, logger=None):
+    """Load *state_dict* into *model* and report key mismatches.
+    Delegates to ``model.load_state_dict``.  When a truthy *logger* is given,
+    one warning is emitted for missing keys and one for unexpected keys, each
+    only if the respective list is non-empty.  Nothing is returned.
+    """
+    outcome = model.load_state_dict(state_dict, strict=strict)
+    # Tolerate return values that lack the two key lists.
+    missing = getattr(outcome, "missing_keys", [])
+    unexpected = getattr(outcome, "unexpected_keys", [])
+    if not logger:
+        return
+    if missing:
+        logger.warning(f"Missing keys: {missing}")
+    if unexpected:
+        logger.warning(f"Unexpected keys: {unexpected}")
+# ---------------------------------------------------------------------------
+# auto_fp16 (no-op decorator)
+# ---------------------------------------------------------------------------
+def auto_fp16(apply_to=None, out_fp32=False):
+    """Stand-in for ``mmcv.runner.auto_fp16`` that changes nothing.
+    Both arguments are accepted and ignored; the returned decorator hands
+    back the very function it receives.
+    """
+    def _passthrough(func):
+        return func
+    return _passthrough
+# ---------------------------------------------------------------------------
+# resize  (replacement for mmseg.ops.resize)
+# ---------------------------------------------------------------------------
+def resize(input, size=None, scale_factor=None, mode="nearest",
+           align_corners=None, warning=True):
+    """Thin wrapper around ``F.interpolate``.
+    *warning* is accepted for signature compatibility and is not used.
+    """
+    interpolate_kwargs = dict(
+        size=size,
+        scale_factor=scale_factor,
+        mode=mode,
+        align_corners=align_corners,
+    )
+    return F.interpolate(input, **interpolate_kwargs)
+# ---------------------------------------------------------------------------
+# FFN  (Feed-Forward Network used in transformer blocks)
+# ---------------------------------------------------------------------------
+class FFN(nn.Module):
+    """Feed-Forward Network compatible with the mmcv API.
+    Builds ``num_fcs - 1`` hidden ``Linear -> activation -> Dropout`` groups
+    followed by an output ``Linear -> Dropout``, then optionally applies
+    *dropout_layer* and adds the identity.
+    Parameters
+    ----------
+    embed_dims : int
+    feedforward_channels : int
+    num_fcs : int  (default 2, must be >= 1)
+    ffn_drop : float  (default 0.)
+    dropout_layer : dict | None  (``dict(type='DropPath', drop_prob=X)``)
+    act_cfg : dict  (default ``dict(type='GELU')``); an unknown ``type``
+        falls back to GELU with a warning
+    add_identity : bool  (default True)
+    Raises
+    ------
+    ValueError
+        If *num_fcs* is smaller than 1.
+    """
+    _ACT = {
+        "GELU": nn.GELU,
+        "ReLU": partial(nn.ReLU, inplace=True),
+        "SiLU": nn.SiLU,
+    }
+    def __init__(self, embed_dims, feedforward_channels, num_fcs=2,
+                 ffn_drop=0., dropout_layer=None, act_cfg=None,
+                 add_identity=True, **kwargs):
+        super().__init__()
+        if num_fcs < 1:
+            raise ValueError(f"num_fcs must be >= 1, got {num_fcs}")
+        if act_cfg is None:
+            act_cfg = dict(type="GELU")
+        act_type = act_cfg.get("type", "GELU")
+        act_cls = self._ACT.get(act_type)
+        if act_cls is None:
+            # Keep the historical GELU fallback, but make it visible.
+            warnings.warn(f"FFN: unsupported activation '{act_type}', "
+                          "falling back to GELU.")
+            act_cls = nn.GELU
+        # NOTE(review): the layers are stored flat (``layers.0``, ``layers.3``
+        # ...).  mmcv's FFN is believed to wrap each hidden group in its own
+        # Sequential (``layers.0.0`` ...); confirm checkpoint keys are
+        # remapped accordingly before loading mmcv-trained weights.
+        layers = []
+        in_dim = embed_dims
+        for _ in range(num_fcs - 1):
+            layers.append(nn.Linear(in_dim, feedforward_channels))
+            layers.append(act_cls())
+            layers.append(nn.Dropout(ffn_drop))
+            in_dim = feedforward_channels
+        # ``in_dim`` (not ``feedforward_channels``) keeps the output layer
+        # shape-consistent when there is no hidden layer (num_fcs == 1).
+        layers.append(nn.Linear(in_dim, embed_dims))
+        layers.append(nn.Dropout(ffn_drop))
+        self.layers = nn.Sequential(*layers)
+        self.dropout_layer = _build_dropout(dropout_layer)
+        self.add_identity = add_identity
+    def forward(self, x, identity=None):
+        """Apply the MLP; add *identity* (default: *x*) if ``add_identity``."""
+        out = self.layers(x)
+        if not isinstance(self.dropout_layer, nn.Identity):
+            out = self.dropout_layer(out)
+        if self.add_identity:
+            if identity is None:
+                identity = x
+            out = out + identity
+        return out
+# ---------------------------------------------------------------------------
+# MultiheadAttention  (mmcv-compatible wrapper)
+# ---------------------------------------------------------------------------
+class MultiheadAttention(nn.Module):
+    """Residual wrapper around ``nn.MultiheadAttention`` with an mmcv-like call.
+    ``forward(query, key=None, value=None, identity=None)``: a missing *key*
+    falls back to *query*, a missing *value* to *key*, and a missing
+    *identity* to *query*.  The attention output passes through
+    ``proj_drop`` and the optional *dropout_layer* before the identity is
+    added.  Extra keyword arguments are accepted and ignored.
+    """
+    def __init__(self, embed_dims, num_heads, attn_drop=0., proj_drop=0.,
+                 batch_first=True, bias=True, dropout_layer=None, **kwargs):
+        super().__init__()
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        self.batch_first = batch_first
+        core_kwargs = dict(
+            embed_dim=embed_dims,
+            num_heads=num_heads,
+            dropout=attn_drop,
+            bias=bias,
+            batch_first=batch_first,
+        )
+        self.attn = nn.MultiheadAttention(**core_kwargs)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.dropout_layer = _build_dropout(dropout_layer)
+    def forward(self, query, key=None, value=None, identity=None,
+                attn_mask=None, key_padding_mask=None, **kwargs):
+        key = query if key is None else key
+        value = key if value is None else value
+        residual = query if identity is None else identity
+        # Index 0 is the attention output; the attention weights are dropped.
+        attended = self.attn(query, key, value, attn_mask=attn_mask,
+                             key_padding_mask=key_padding_mask)[0]
+        attended = self.proj_drop(attended)
+        if isinstance(self.dropout_layer, nn.Identity):
+            return attended + residual
+        return self.dropout_layer(attended) + residual
+# ---------------------------------------------------------------------------
+# ConvModule  (Conv + optional Norm + optional Activation)
+# ---------------------------------------------------------------------------
+class ConvModule(nn.Module):
+    """Conv2d + optional normalization + optional activation.
+    Parameters
+    ----------
+    in_channels, out_channels, kernel_size, stride, padding, dilation, groups
+        Forwarded to ``nn.Conv2d``.
+    bias : bool | 'auto'
+        ``'auto'`` enables the conv bias only when no norm layer is used.
+    conv_cfg : accepted for signature compatibility, not used.
+    norm_cfg : dict | None
+        Passed to ``build_norm_layer``; ``None`` means no norm layer.
+    act_cfg : dict | None
+        Omitted -> ReLU.  An explicit ``None`` disables the activation.
+        Supported types: ReLU, GELU, SiLU, LeakyReLU.
+    inplace : bool
+        Used for activations that support in-place operation.
+    order : sequence of 'conv' / 'norm' / 'act'
+        Execution order of the three stages.
+    Raises
+    ------
+    ValueError
+        If the activation type is not supported.
+    """
+    # Sentinel default: lets an explicit ``act_cfg=None`` mean "no activation"
+    # while an omitted argument still selects ReLU.
+    _DEFAULT_ACT = object()
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias="auto",
+                 conv_cfg=None, norm_cfg=None, act_cfg=_DEFAULT_ACT,
+                 inplace=True, order=("conv", "norm", "act"), **kwargs):
+        super().__init__()
+        if act_cfg is self._DEFAULT_ACT:
+            act_cfg = dict(type="ReLU")
+        # bias defaults to True when no norm, False otherwise
+        if bias == "auto":
+            bias = norm_cfg is None
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, kernel_size,
+            stride=stride, padding=padding, dilation=dilation,
+            groups=groups, bias=bias,
+        )
+        # NOTE(review): the norm layer is registered as ``norm`` and the
+        # activation as ``act``; mmcv is believed to register them under the
+        # norm abbreviation (e.g. ``bn``) and ``activate`` -- confirm that
+        # checkpoint keys are remapped where needed.
+        self.norm = None
+        if norm_cfg is not None:
+            stages = list(order)
+            # A norm that runs before the conv sees the *input* channels.
+            norm_first = ("norm" in stages and "conv" in stages
+                          and stages.index("norm") < stages.index("conv"))
+            norm_channels = in_channels if norm_first else out_channels
+            _, self.norm = build_norm_layer(norm_cfg, norm_channels)
+        self.act = None
+        if act_cfg is not None:
+            act_type = act_cfg.get("type", "ReLU")
+            if act_type == "ReLU":
+                self.act = nn.ReLU(inplace=inplace)
+            elif act_type == "GELU":
+                self.act = nn.GELU()
+            elif act_type == "SiLU":
+                self.act = nn.SiLU(inplace=inplace)
+            elif act_type == "LeakyReLU":
+                self.act = nn.LeakyReLU(
+                    negative_slope=act_cfg.get("negative_slope", 0.01),
+                    inplace=inplace)
+            else:
+                raise ValueError(f"Unsupported activation: {act_type}")
+        self.order = order
+    def forward(self, x):
+        """Run the configured stages in ``self.order``; absent ones are skipped."""
+        for layer_name in self.order:
+            if layer_name == "conv":
+                x = self.conv(x)
+            elif layer_name == "norm" and self.norm is not None:
+                x = self.norm(x)
+            elif layer_name == "act" and self.act is not None:
+                x = self.act(x)
+        return x
+# ---------------------------------------------------------------------------
+# PatchEmbed  (Patch Embedding via Conv2d)
+# ---------------------------------------------------------------------------
+class PatchEmbed(nn.Module):
+    """Image to patch embedding.
+    Returns ``(tokens, hw_shape)`` where tokens is ``(B, H*W, C)``
+    and ``hw_shape`` is ``(H, W)`` of the grid.
+    ``padding='corner'`` zero-pads the bottom/right edge and
+    ``padding='same'`` zero-pads evenly on both sides, each just enough for
+    the kernel to cover the whole input; inputs that already fit are left
+    untouched.  Any other string is treated as ``padding=0``.
+    When *input_size* is given, ``self.init_out_size`` is set to the grid
+    size that input would produce (adaptive padding included).
+    ``conv_type`` and ``init_cfg`` are accepted for signature compatibility
+    and are not used.
+    """
+    def __init__(self, in_channels=3, embed_dims=768, conv_type="Conv2d",
+                 kernel_size=16, stride=16, padding="corner",
+                 dilation=1, norm_cfg=None, input_size=None,
+                 init_cfg=None, **kwargs):
+        super().__init__()
+        self.adaptive_padding = None
+        if isinstance(padding, str):
+            if padding in ("corner", "same"):
+                self.adaptive_padding = padding
+            # The conv itself never pads in string mode.
+            padding = 0
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size)
+        if isinstance(stride, int):
+            stride = (stride, stride)
+        if isinstance(padding, int):
+            padding = (padding, padding)
+        if isinstance(dilation, int):
+            dilation = (dilation, dilation)
+        # Geometry kept for the adaptive-padding computation.
+        self._kernel_size = tuple(kernel_size)
+        self._stride = tuple(stride)
+        self._dilation = tuple(dilation)
+        self.projection = nn.Conv2d(
+            in_channels, embed_dims,
+            kernel_size=kernel_size, stride=stride,
+            padding=padding, dilation=dilation,
+        )
+        self.norm = None
+        if norm_cfg is not None:
+            _, self.norm = build_norm_layer(norm_cfg, embed_dims)
+        # Pre-compute output spatial size when input_size is known.
+        self.init_out_size = None
+        if input_size is not None:
+            if isinstance(input_size, int):
+                input_size = (input_size, input_size)
+            in_h, in_w = input_size
+            if self.adaptive_padding is not None:
+                pad_h, pad_w = self._get_pad_shape((in_h, in_w))
+                in_h += pad_h
+                in_w += pad_w
+            h_out = (in_h + 2 * padding[0] - dilation[0] *
+                     (kernel_size[0] - 1) - 1) // stride[0] + 1
+            w_out = (in_w + 2 * padding[1] - dilation[1] *
+                     (kernel_size[1] - 1) - 1) // stride[1] + 1
+            self.init_out_size = (h_out, w_out)
+    def _get_pad_shape(self, input_hw):
+        """Return ``(pad_h, pad_w)`` needed so the kernel covers the input."""
+        pads = []
+        for size, k, s, d in zip(input_hw, self._kernel_size,
+                                 self._stride, self._dilation):
+            n_out = -(-size // s)  # ceil(size / s) without floats
+            pads.append(max((n_out - 1) * s + (k - 1) * d + 1 - size, 0))
+        return tuple(pads)
+    def _adaptive_pad(self, x):
+        """Zero-pad ``(B, C, H, W)`` input according to the padding mode."""
+        if self.adaptive_padding is None:
+            return x
+        pad_h, pad_w = self._get_pad_shape(x.shape[-2:])
+        if pad_h == 0 and pad_w == 0:
+            return x
+        if self.adaptive_padding == "corner":
+            return F.pad(x, [0, pad_w, 0, pad_h])
+        return F.pad(x, [pad_w // 2, pad_w - pad_w // 2,
+                         pad_h // 2, pad_h - pad_h // 2])
+    def forward(self, x):
+        x = self._adaptive_pad(x)
+        x = self.projection(x)                   # (B, C, H, W)
+        hw_shape = (x.shape[2], x.shape[3])
+        x = x.flatten(2).transpose(1, 2)         # (B, H*W, C)
+        if self.norm is not None:
+            x = self.norm(x)
+        return x, hw_shape
+# ---------------------------------------------------------------------------
+# resize_pos_embed  (from mmcls)
+# ---------------------------------------------------------------------------
+def resize_pos_embed(pos_embed, src_shape, dst_shape, mode="bicubic",
+                     num_extra_tokens=0):
+    """Resize position embeddings via interpolation.
+    The first *num_extra_tokens* tokens are passed through untouched; the
+    remaining ``H_src*W_src`` tokens are laid out as a 2-D grid, interpolated
+    in float32 and cast back to the dtype of *pos_embed*.
+    Parameters
+    ----------
+    pos_embed : Tensor  (1, L, C) or (1, extra+H*W, C)
+    src_shape : sequence  (H_src, W_src)
+    dst_shape : sequence  (H_dst, W_dst)
+    mode : str
+    num_extra_tokens : int  (e.g. 1 for CLS token)
+    Returns
+    -------
+    Tensor  (1, extra + H_dst*W_dst, C); *pos_embed* itself when the two
+    shapes are equal.
+    Raises
+    ------
+    ValueError
+        If the number of grid tokens does not equal ``H_src * W_src``.
+    """
+    src_h, src_w = src_shape
+    dst_h, dst_w = dst_shape
+    # Compare element-wise so that tuple / list / torch.Size mixes still hit
+    # the early return.
+    if src_h == dst_h and src_w == dst_w:
+        return pos_embed
+    extra_tokens = pos_embed[:, :num_extra_tokens]
+    pos_tokens = pos_embed[:, num_extra_tokens:]
+    B, L, C = pos_tokens.shape
+    if L != src_h * src_w:
+        raise ValueError(
+            f"pos_embed has {L} grid tokens, expected {src_h}x{src_w}")
+    pos_tokens = pos_tokens.reshape(B, src_h, src_w, C).permute(0, 3, 1, 2)
+    # Interpolate in float32, then restore the caller's dtype so half / bf16
+    # embeddings are not silently promoted.
+    pos_tokens = F.interpolate(
+        pos_tokens.float(), size=(dst_h, dst_w), mode=mode,
+        align_corners=False if mode == "bicubic" else None,
+    ).to(pos_embed.dtype)
+    pos_tokens = pos_tokens.permute(0, 2, 3, 1).reshape(B, -1, C)
+    if num_extra_tokens > 0:
+        pos_tokens = torch.cat([extra_tokens, pos_tokens], dim=1)
+    return pos_tokens
+# ---------------------------------------------------------------------------
+# PatchMerging  (from mmcls – downsamples by merging 2×2 patches)
+# ---------------------------------------------------------------------------
+class PatchMerging(nn.Module):
+    """Downsample a token grid by fusing each 2x2 neighbourhood.
+    Input : (B, H*W, C) tokens plus ``input_size`` = (H, W)
+    Output: (B, ceil(H/2)*ceil(W/2), out_channels) tokens plus the new
+    grid size.  Odd grids are zero-padded at the bottom/right first.
+    The norm layer (LayerNorm unless *norm_cfg* says otherwise) runs after
+    the linear reduction when ``is_post_norm`` is true, before it otherwise.
+    NOTE(review): the four neighbours are concatenated in the order
+    top-left, bottom-left, top-right, bottom-right.  Weights trained with a
+    different gathering order (e.g. an ``nn.Unfold``-based merge) would need
+    their ``reduction`` columns reordered -- confirm against the checkpoint.
+    """
+    def __init__(self, in_channels, out_channels,
+                 norm_cfg=None, is_post_norm=True, **kwargs):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.is_post_norm = is_post_norm
+        merged_channels = 4 * in_channels
+        self.reduction = nn.Linear(merged_channels, out_channels, bias=False)
+        # Post-norm sees the reduced width, pre-norm the concatenated width.
+        norm_width = out_channels if is_post_norm else merged_channels
+        effective_cfg = dict(type="LN") if norm_cfg is None else norm_cfg
+        self.norm = build_norm_layer(effective_cfg, norm_width)[1]
+    def forward(self, x, input_size):
+        H, W = input_size
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+        grid = x.view(B, H, W, C)
+        # Pad one row / column when the respective extent is odd.
+        pad_h, pad_w = H % 2, W % 2
+        if pad_h or pad_w:
+            grid = F.pad(grid, (0, 0, 0, pad_w, 0, pad_h))
+        # (row offset, col offset) order: (0,0), (1,0), (0,1), (1,1)
+        neighbours = [grid[:, row::2, col::2, :]
+                      for col in (0, 1) for row in (0, 1)]
+        merged = torch.cat(neighbours, dim=-1)   # (B, H/2, W/2, 4*C)
+        out_h, out_w = (H + 1) // 2, (W + 1) // 2
+        merged = merged.view(B, out_h * out_w, 4 * C)
+        if self.is_post_norm:
+            merged = self.norm(self.reduction(merged))
+        else:
+            merged = self.reduction(self.norm(merged))
+        return merged, (out_h, out_w)
+# ---------------------------------------------------------------------------
+# WindowMSAV2  (Window-based Multi-head Self-Attention V2)
+# ---------------------------------------------------------------------------
+class WindowMSAV2(nn.Module):
+    """Window-based Multi-head Self-Attention V2 with log-spaced continuous
+    position bias (cosine attention + log-CPB).
+    Attention logits are cosine similarities scaled by a learnable, clamped
+    per-head factor; the position bias comes from a two-layer MLP evaluated
+    on log-spaced relative coordinates and is squashed to (0, 16).
+    Parameters
+    ----------
+    embed_dims : int
+    num_heads : int
+    window_size : int | tuple[int, int]
+    pretrained_window_size : int | tuple[int, int]  (default (0, 0))
+        When its first entry is positive, relative coordinates are
+        normalised by this size instead of *window_size*.
+    qkv_bias : bool  (default True)
+        Adds learnable biases to q and v; the k bias is fixed at zero.
+    attn_drop : float  (default 0.)
+    proj_drop : float  (default 0.)
+    cpb_mlp_hidden_dims : int  (default 512)
+        Hidden width of the continuous-position-bias MLP.
+    """
+    def __init__(self, embed_dims, num_heads, window_size,
+                 pretrained_window_size=(0, 0), qkv_bias=True,
+                 attn_drop=0., proj_drop=0., cpb_mlp_hidden_dims=512,
+                 **kwargs):
+        super().__init__()
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        if isinstance(window_size, int):
+            window_size = (window_size, window_size)
+        self.window_size = window_size
+        if isinstance(pretrained_window_size, int):
+            pretrained_window_size = (pretrained_window_size,
+                                      pretrained_window_size)
+        self.pretrained_window_size = pretrained_window_size
+        # Stored in log space; forward() clamps and exponentiates it.
+        self.logit_scale = nn.Parameter(
+            torch.log(10.0 * torch.ones((num_heads, 1, 1))))
+        # -- Continuous Position Bias MLP (log-CPB) --
+        self.cpb_mlp = nn.Sequential(
+            nn.Linear(2, cpb_mlp_hidden_dims, bias=True),
+            nn.ReLU(inplace=True),
+            nn.Linear(cpb_mlp_hidden_dims, num_heads, bias=False),
+        )
+        # Build relative coords table
+        self._build_relative_coords_table()
+        # Build relative position index
+        self._build_relative_position_index()
+        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(embed_dims))
+            self.v_bias = nn.Parameter(torch.zeros(embed_dims))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(embed_dims, embed_dims)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.softmax = nn.Softmax(dim=-1)
+    @staticmethod
+    def _span(size):
+        """Return ``size - 1``, or 1 when that would be zero."""
+        return (size - 1) or 1
+    def _build_relative_coords_table(self):
+        """Register the ``(1, (2Wh-1)*(2Ww-1), 2)`` log-spaced offset table."""
+        Wh, Ww = self.window_size
+        coords_h = torch.arange(-(Wh - 1), Wh, dtype=torch.float32)
+        coords_w = torch.arange(-(Ww - 1), Ww, dtype=torch.float32)
+        # coords_table: (1, 2*Wh-1, 2*Ww-1, 2)
+        coords_table = torch.stack(
+            torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
+        coords_table = coords_table.unsqueeze(0)
+        # Normalise by the reference window; a size-1 window would divide by
+        # zero, so _span() substitutes 1 in that case.
+        if self.pretrained_window_size[0] > 0:
+            ref_h, ref_w = self.pretrained_window_size
+        else:
+            ref_h, ref_w = Wh, Ww
+        coords_table[:, :, :, 0] /= self._span(ref_h)
+        coords_table[:, :, :, 1] /= self._span(ref_w)
+        coords_table *= 8  # normalise to -8, 8
+        coords_table = (
+            torch.sign(coords_table)
+            * torch.log2(torch.abs(coords_table) + 1.0)
+            / math.log2(8)
+        )
+        self.register_buffer("relative_coords_table",
+                             coords_table.view(1, -1, 2))
+    def _build_relative_position_index(self):
+        """Register the ``(N, N)`` lookup from token pair to table row."""
+        Wh, Ww = self.window_size
+        coords_h = torch.arange(Wh)
+        coords_w = torch.arange(Ww)
+        coords = torch.stack(torch.meshgrid(coords_h, coords_w,
+                                            indexing="ij"))  # (2, Wh, Ww)
+        coords_flat = torch.flatten(coords, 1)                # (2, Wh*Ww)
+        relative = coords_flat[:, :, None] - coords_flat[:, None, :]
+        relative = relative.permute(1, 2, 0).contiguous()     # (N, N, 2)
+        # Shift offsets to start at 0, then flatten (dh, dw) row-major.
+        relative[:, :, 0] += Wh - 1
+        relative[:, :, 1] += Ww - 1
+        relative[:, :, 0] *= 2 * Ww - 1
+        index = relative.sum(-1)  # (N, N)
+        self.register_buffer("relative_position_index", index)
+    def forward(self, x, mask=None):
+        """
+        Parameters
+        ----------
+        x : Tensor  (B*num_windows, N, C)  where N = Wh*Ww
+        mask : Tensor | None
+            Additive mask of shape (num_windows, N, N), broadcast over the
+            batch and the heads.
+        Returns
+        -------
+        Tensor  (B*num_windows, N, C)
+        """
+        B_, N, C = x.shape
+        # QKV with optional bias
+        if self.q_bias is not None:
+            qkv_bias = torch.cat([
+                self.q_bias,
+                torch.zeros_like(self.v_bias),
+                self.v_bias,
+            ])
+            qkv = F.linear(x, self.qkv.weight, qkv_bias)
+        else:
+            qkv = self.qkv(x)
+        qkv = qkv.reshape(B_, N, 3, self.num_heads,
+                           C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+        # Cosine attention
+        q = F.normalize(q, dim=-1)
+        k = F.normalize(k, dim=-1)
+        logit_scale = torch.clamp(self.logit_scale,
+                                  max=math.log(100.0)).exp()
+        attn = (q @ k.transpose(-2, -1)) * logit_scale
+        # Continuous position bias
+        relative_position_bias = self.cpb_mlp(
+            self.relative_coords_table).view(-1, self.num_heads)
+        index = self.relative_position_index.view(-1)
+        relative_position_bias = relative_position_bias[index].view(
+            N, N, -1)
+        relative_position_bias = relative_position_bias.permute(2, 0, 1)
+        # 16 * sigmoid for stable training
+        relative_position_bias = 16.0 * torch.sigmoid(
+            relative_position_bias)
+        attn = attn + relative_position_bias.unsqueeze(0)
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N)
+            attn = attn + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+        attn = self.softmax(attn)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+# ---------------------------------------------------------------------------
+# ShiftWindowMSA  (Shifted-Window Multi-head Self-Attention)
+# ---------------------------------------------------------------------------
+def _window_partition(x, window_size):
+    """Cut a feature map into non-overlapping windows.
+    x : (B, H, W, C)  ->  (B * nH * nW, wh, ww, C) with
+    ``window_size = (wh, ww)``, ``nH = H // wh`` and ``nW = W // ww``.
+    """
+    batch, height, width, channels = x.shape
+    win_h, win_w = window_size
+    tiled = x.view(batch, height // win_h, win_h,
+                   width // win_w, win_w, channels)
+    # Bring the two window-grid axes next to each other before flattening.
+    tiled = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
+    return tiled.view(-1, win_h, win_w, channels)
+def _window_reverse(windows, window_size, H, W):
+    """Reverse window partitioning.
+    windows : (B * nH * nW, wh, ww, C)  ->  (B, H, W, C)
+    *H* and *W* are the extents of the partitioned map and are expected to
+    be multiples of ``wh`` and ``ww`` respectively.
+    """
+    wh, ww = window_size
+    n_h, n_w = H // wh, W // ww
+    # Explicit integer arithmetic: no reliance on ``//`` / ``*`` precedence
+    # and no round-trip through float division.
+    B = windows.shape[0] // (n_h * n_w)
+    x = windows.view(B, n_h, n_w, wh, ww, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+class ShiftWindowMSA(nn.Module):
+    """Shifted-Window Multi-head Self-Attention.
+    The token sequence is reshaped to a (H, W) grid, zero-padded to a
+    multiple of the window size, optionally shifted cyclically, attended
+    window by window, and finally un-shifted, cropped and flattened again.
+    Parameters
+    ----------
+    embed_dims : int
+    num_heads : int
+    window_size : int | tuple[int, int]
+    shift_size : int  (default 0)
+    dropout_layer : dict | None
+    pad_small_map : bool  (default False)
+        When the (padded) map is no larger than one window, the shift is
+        disabled unless this flag is set.
+    window_msa : type  (default WindowMSAV2)
+    msa_cfg : dict  (extra kwargs forwarded to window_msa)
+    """
+    def __init__(self, embed_dims, num_heads, window_size, shift_size=0,
+                 dropout_layer=None, pad_small_map=False,
+                 window_msa=WindowMSAV2, msa_cfg=None, **kwargs):
+        super().__init__()
+        if msa_cfg is None:
+            msa_cfg = {}
+        if isinstance(window_size, int):
+            window_size = (window_size, window_size)
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.pad_small_map = pad_small_map
+        self.w_msa = window_msa(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            window_size=window_size,
+            **msa_cfg,
+        )
+        self.drop = _build_dropout(dropout_layer)
+    def _build_attn_mask(self, reference, Hp, Wp, shift_size):
+        """Return the additive (num_windows, N, N) mask for a shifted map.
+        Positions from different pre-shift regions get -100, others 0.
+        *reference* only supplies dtype and device.
+        """
+        wh, ww = self.window_size
+        img_mask = reference.new_zeros((1, Hp, Wp, 1))
+        h_slices = (slice(0, -wh),
+                    slice(-wh, -shift_size),
+                    slice(-shift_size, None))
+        w_slices = (slice(0, -ww),
+                    slice(-ww, -shift_size),
+                    slice(-shift_size, None))
+        # Label each of the up-to-nine regions with its own id.
+        cnt = 0
+        for h_s in h_slices:
+            for w_s in w_slices:
+                img_mask[:, h_s, w_s, :] = cnt
+                cnt += 1
+        mask_windows = _window_partition(img_mask, self.window_size)
+        mask_windows = mask_windows.view(-1, wh * ww)
+        region_diff = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        return region_diff.masked_fill(
+            region_diff != 0, float(-100.0)).masked_fill(
+            region_diff == 0, float(0.0))
+    def forward(self, query, hw_shape):
+        """Attend over ``query`` of shape (B, H*W, C); returns the same shape."""
+        B, L, C = query.shape
+        H, W = hw_shape
+        assert L == H * W, "input feature has wrong size"
+        query = query.view(B, H, W, C)
+        wh, ww = self.window_size
+        # Pad bottom/right so both extents are multiples of the window
+        pad_r = (ww - W % ww) % ww
+        pad_b = (wh - H % wh) % wh
+        if pad_r > 0 or pad_b > 0:
+            query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b))
+        Hp, Wp = query.shape[1], query.shape[2]
+        shift_size = self.shift_size
+        # A map that fits into a single window is not shifted, unless the
+        # caller asked for small maps to be padded and treated normally.
+        if min(Hp, Wp) <= max(wh, ww) and not self.pad_small_map:
+            shift_size = 0
+        attn_mask = None
+        if shift_size > 0:
+            query = torch.roll(query, shifts=(-shift_size, -shift_size),
+                               dims=(1, 2))
+            attn_mask = self._build_attn_mask(query, Hp, Wp, shift_size)
+        # Partition into windows
+        x_windows = _window_partition(query, self.window_size)
+        x_windows = x_windows.view(-1, wh * ww, C)
+        # W-MSA / SW-MSA
+        attn_windows = self.w_msa(x_windows, mask=attn_mask)
+        # Merge windows back
+        attn_windows = attn_windows.view(-1, wh, ww, C)
+        x = _window_reverse(attn_windows, self.window_size, Hp, Wp)
+        # Reverse cyclic shift
+        if shift_size > 0:
+            x = torch.roll(x, shifts=(shift_size, shift_size), dims=(1, 2))
+        # Remove padding
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+        x = x.view(B, H * W, C)
+        x = self.drop(x)
+        return x
+# ---------------------------------------------------------------------------
+# deprecated_api_warning (no-op decorator for mmcv.utils)
+# ---------------------------------------------------------------------------
+def deprecated_api_warning(name_dict, cls_name=None):
+    """Decorator factory that leaves the decorated function untouched.
+    *name_dict* and *cls_name* are accepted and ignored.
+    """
+    def _passthrough(func):
+        return func
+    return _passthrough
+# ---------------------------------------------------------------------------
+# build_pixel_sampler stub (mmseg.core)
+# ---------------------------------------------------------------------------
+def build_pixel_sampler(cfg, context=None):
+    """Placeholder for the mmseg pixel-sampler builder.
+    Emits a warning and returns ``None`` regardless of the arguments.
+    """
+    message = ("build_pixel_sampler is a stub in compat.py; "
+               "returning None.")
+    warnings.warn(message)
+    return None
+# ---------------------------------------------------------------------------
+# BaseBackbone alias (mmcls.models.backbones.base_backbone)
+# ---------------------------------------------------------------------------
+# ``BaseBackbone`` is the very same class object as ``BaseModule``; it adds
+# no behaviour of its own and exists only as a second import name.
+BaseBackbone = BaseModule
+# ---------------------------------------------------------------------------
+# Convenience re-exports
+# ---------------------------------------------------------------------------
+# Names exported by ``from .compat import *``.  Because ``__all__`` is
+# defined, exactly these names are exported -- including the re-exported
+# imports ``to_2tuple`` and the underscore-prefixed ``_BatchNorm``, which a
+# star-import would otherwise skip.
+__all__ = [
+    # norm / init
+    "build_norm_layer",
+    "trunc_normal_",
+    "constant_init",
+    "kaiming_init",
+    # modules
+    "BaseModule",
+    "BaseBackbone",
+    "ModuleList",
+    "CheckpointLoader",
+    "load_state_dict",
+    "auto_fp16",
+    # layers
+    "DropPath",
+    "FFN",
+    "MultiheadAttention",
+    "ConvModule",
+    "PatchEmbed",
+    "PatchMerging",
+    "WindowMSAV2",
+    "ShiftWindowMSA",
+    # functions
+    "resize",
+    "resize_pos_embed",
+    "to_2tuple",
+    "deprecated_api_warning",
+    "build_pixel_sampler",
+    "get_root_logger",
+    # types (imported from torch, re-exported here)
+    "_BatchNorm",
+]