File size: 3,901 Bytes
984cdba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
import timm
# ---------------------------------------------------------
# Basic CNN Blocks
# ---------------------------------------------------------
class DoubleConv(nn.Module):
    """Two back-to-back ``Conv3x3 -> BatchNorm -> ReLU`` stages.

    Both convolutions use ``padding=1`` with the default stride, so the
    spatial size of the input is preserved; only the channel count changes
    (``in_ch -> out_ch -> out_ch``). The convolutions carry no bias because
    the BatchNorm that follows each one supplies its own shift.

    Args:
        in_ch: Number of channels of the incoming feature map.
        out_ch: Number of channels produced by both convolutions.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        stages = []
        # First stage maps in_ch -> out_ch, second stage keeps out_ch.
        for stage_in in (in_ch, out_ch):
            stages.extend(
                [
                    nn.Conv2d(stage_in, out_ch, kernel_size=3, padding=1, bias=False),
                    nn.BatchNorm2d(out_ch),
                    nn.ReLU(inplace=True),
                ]
            )
        self.block = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through both conv stages and return the result."""
        return self.block(x)
class UpBlock(nn.Module):
    """Decoder step: bilinear upsample, concatenate the skip, then DoubleConv.

    Upsampling is done by interpolation rather than a transposed
    convolution, which the original author chose to avoid grid artifacts.

    Args:
        in_ch: Channels of the low-resolution input feature map.
        skip_ch: Channels of the skip-connection feature map.
        out_ch: Channels produced by the fusing ``DoubleConv``.
    """

    def __init__(self, in_ch, skip_ch, out_ch):
        super().__init__()
        fused_ch = in_ch + skip_ch  # channel count after concatenation
        self.conv = DoubleConv(fused_ch, out_ch)

    def forward(self, x, skip):
        """Resize ``x`` to the spatial size of ``skip``, fuse, and convolve."""
        target_hw = skip.shape[2:]
        upsampled = F.interpolate(
            x,
            size=target_hw,
            mode="bilinear",
            align_corners=False,
        )
        fused = torch.cat([upsampled, skip], dim=1)
        return self.conv(fused)
# ---------------------------------------------------------
# SwinV2 + CNN Decoder
# ---------------------------------------------------------
class model(nn.Module):
    """SwinV2-tiny encoder with a bilinear-upsampling CNN decoder.

    The encoder is timm's ``swinv2_tiny_window8_256`` in ``features_only``
    mode, yielding four feature maps. Three ``UpBlock`` stages merge them
    from coarsest to finest, a small conv stack refines the result, and a
    1x1 convolution produces per-class logits at the input resolution.

    Constructing the module loads pretrained encoder weights through timm
    (``pretrained=True``).

    Args:
        in_channels: Number of channels of the input image. When this differs
            from the pretrained patch-embedding convolution, that convolution
            is replaced by one adapted from the pretrained weights; when it
            matches, the pretrained convolution is kept untouched.
        num_classes: Number of output channels (logits per pixel).
        freeze_encoder: If true, ``requires_grad`` is cleared on every encoder
            parameter. A patch-embedding convolution created for a
            non-default ``in_channels`` is left trainable. Only gradients are
            disabled; the encoder's train/eval mode is not changed here.
    """

    def __init__(
        self,
        in_channels=3,
        num_classes=15,
        freeze_encoder=False,
    ):
        super().__init__()

        # -------------------------------
        # Encoder (SwinV2)
        # -------------------------------
        self.encoder = timm.create_model(
            "swinv2_tiny_window8_256",
            pretrained=True,
            features_only=True,
            out_indices=(0, 1, 2, 3),
        )

        if freeze_encoder:
            for p in self.encoder.parameters():
                p.requires_grad = False

        # Swap the patch embedding only when the channel count really
        # differs; replacing it unconditionally would throw away the
        # pretrained stem for the default 3-channel case.
        old_proj = self.encoder.patch_embed.proj
        if in_channels != old_proj.in_channels:
            self.encoder.patch_embed.proj = self._adapt_patch_proj(
                old_proj, in_channels
            )

        # Encoder channel sizes, finest to coarsest feature map.
        c0, c1, c2, c3 = self.encoder.feature_info.channels()

        # -------------------------------
        # CNN Decoder (artifact-free)
        # -------------------------------
        self.up3 = UpBlock(c3, c2, c2)  # 1/32 → 1/16
        self.up2 = UpBlock(c2, c1, c1)  # 1/16 → 1/8
        self.up1 = UpBlock(c1, c0, c0)  # 1/8 → 1/4

        self.refine = nn.Sequential(
            nn.Conv2d(c0, c0, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(c0, c0, 3, padding=1),
            nn.ReLU(inplace=True),
        )

        self.head = nn.Conv2d(c0, num_classes, kernel_size=1)

    @staticmethod
    def _adapt_patch_proj(old_proj, in_channels):
        """Build a patch-embedding conv for ``in_channels`` from ``old_proj``.

        The new convolution copies the geometry (output channels, kernel,
        stride, padding, bias presence) of ``old_proj`` and is initialised
        from its weights: for a single input channel the kernel is summed
        over the input-channel axis; otherwise the kernel is tiled along
        that axis, truncated to ``in_channels`` and rescaled by
        ``old_in / in_channels`` so a channel-replicated input produces a
        response of similar magnitude. The bias, if any, is copied.

        Args:
            old_proj: The existing ``nn.Conv2d`` patch projection.
            in_channels: Desired number of input channels.

        Returns:
            A new, trainable ``nn.Conv2d``.
        """
        new_proj = nn.Conv2d(
            in_channels=in_channels,
            out_channels=old_proj.out_channels,
            kernel_size=old_proj.kernel_size,
            stride=old_proj.stride,
            padding=old_proj.padding,
            bias=old_proj.bias is not None,
        )
        old_in = old_proj.in_channels
        with torch.no_grad():
            old_weight = old_proj.weight.detach()
            if in_channels == 1:
                weight = old_weight.sum(dim=1, keepdim=True)
            else:
                repeats = -(-in_channels // old_in)  # ceil division
                weight = old_weight.repeat(1, repeats, 1, 1)[:, :in_channels]
                weight = weight * (old_in / float(in_channels))
            new_proj.weight.copy_(weight)
            if old_proj.bias is not None:
                new_proj.bias.copy_(old_proj.bias.detach())
        return new_proj

    # ---------------------------------------------------------
    # Forward
    # ---------------------------------------------------------
    def forward(self, x):
        """Return per-pixel class logits for ``x``.

        Args:
            x: Image batch laid out as (B, C, H, W); the last two dimensions
                are used as the output size. NOTE(review): the encoder name
                suggests a 256x256 input is expected — confirm against timm.

        Returns:
            Tensor with ``num_classes`` channels and the spatial size of ``x``.
        """
        f0, f1, f2, f3 = self.encoder(x)

        # Swin outputs are (B, H, W, C); the conv decoder needs (B, C, H, W).
        f0 = rearrange(f0, "b h w c -> b c h w")
        f1 = rearrange(f1, "b h w c -> b c h w")
        f2 = rearrange(f2, "b h w c -> b c h w")
        f3 = rearrange(f3, "b h w c -> b c h w")

        # Decoder: merge coarse features into progressively finer skips.
        d3 = self.up3(f3, f2)
        d2 = self.up2(d3, f1)
        d1 = self.up1(d2, f0)

        d1 = self.refine(d1)

        # Classify at decoder resolution, then upsample the logits. A 1x1
        # convolution and bilinear interpolation commute (the interpolation
        # weights sum to one), so this equals upsampling the features first
        # while convolving and resizing far fewer values.
        logits = self.head(d1)
        return F.interpolate(
            logits, size=x.shape[2:], mode="bilinear", align_corners=False
        )
|