import torch
import torch.nn as nn
import torch.nn.functional as F

LATENT_DIM = 1280


def _make_divisible(v, divisor, min_value=None):
    """Round the channel count `v` to the nearest multiple of `divisor`,
    never dropping below `min_value` or more than 10% below `v`."""
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that rounding down did not shrink the width by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
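
# Worked example: _make_divisible(30, 8) computes int(30 + 4) // 8 * 8 = 32;
# since 32 >= 0.9 * 30 no correction is applied, so a width of 30 rounds to 32.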


class ConvBNReLU(nn.Sequential):
    """Convolution followed by batch norm and ReLU6 (MobileNetV2-style)."""

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=nn.BatchNorm2d):
        padding = (kernel_size - 1) // 2
        super().__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            norm_layer(out_planes),
            nn.ReLU6(inplace=True),
        )


class SqueezeExcitation(nn.Module):
    """Squeeze-and-Excitation: global-pool the feature map, squeeze channels,
    re-expand, and gate the input channel-wise with a sigmoid."""

    def __init__(self, input_channels, squeeze_factor=4):
        super().__init__()
        squeeze_channels = _make_divisible(input_channels // squeeze_factor, 8)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.conv_reduce = nn.Conv2d(input_channels, squeeze_channels, 1, bias=True)
        self.conv_expand = nn.Conv2d(squeeze_channels, input_channels, 1, bias=True)

    def forward(self, x):
        scale = self.avgpool(x)
        scale = F.relu(self.conv_reduce(scale), inplace=True)
        scale = torch.sigmoid(self.conv_expand(scale))
        return x * scale


class InvertedResidual(nn.Module):
    """MobileNetV2-style inverted residual block: 1x1 expansion, 3x3 depthwise
    convolution, optional squeeze-excitation, and a 1x1 linear projection."""

    def __init__(self, in_chs, out_chs, stride, expand_ratio, se_layer=None):
        super().__init__()
        hidden_dim = in_chs * expand_ratio
        self.use_res_connect = stride == 1 and in_chs == out_chs
        norm_layer = nn.BatchNorm2d

        # Pointwise expansion; skipped entirely when expand_ratio == 1.
        if expand_ratio != 1:
            self.conv_pw = nn.Sequential(
                nn.Conv2d(in_chs, hidden_dim, 1, 1, 0, bias=False),
                norm_layer(hidden_dim),
                nn.ReLU6(inplace=True),
            )
        else:
            self.conv_pw = nn.Identity()

        # Depthwise 3x3 convolution (groups == channels).
        self.conv_dw = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            norm_layer(hidden_dim),
            nn.ReLU6(inplace=True),
        )

        # Optional squeeze-excitation on the expanded features.
        self.se = se_layer(hidden_dim) if se_layer else nn.Identity()

        # Pointwise-linear projection back down; no activation, as in MobileNetV2.
        self.conv_pwl = nn.Sequential(
            nn.Conv2d(hidden_dim, out_chs, 1, 1, 0, bias=False),
            norm_layer(out_chs),
        )

    def forward(self, x):
        out = self.conv_pwl(self.se(self.conv_dw(self.conv_pw(x))))
        # Residual connection only when stride is 1 and channels are unchanged.
        return x + out if self.use_res_connect else out


class DeepSVDD(nn.Module):
    """
    Deep SVDD model with a manually reconstructed MobileNetV3-like structure
    to match the checkpoint's layer names (conv_stem, blocks.X.Y.conv_pw, etc.).
    """

    def __init__(self, latent_dim=LATENT_DIM):
        super().__init__()
        norm_layer = nn.BatchNorm2d

        # Each row: t (expansion ratio), c (output channels), n (repeats),
        # s (stride of the first block in the stage), se (use squeeze-excitation).
        inverted_residual_setting = [
            [1, 16, 1, 1, False],
            [6, 24, 2, 2, False],
            [6, 32, 3, 2, False],
            [6, 64, 4, 2, True],
            [6, 96, 3, 1, True],
            [6, 160, 3, 2, True],
            [6, 320, 1, 1, True],
        ]

        # Stem: kept as named attributes so checkpoint keys (conv_stem, bn1) match.
        input_channel = 32
        self.conv_stem = nn.Conv2d(3, input_channel, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = norm_layer(input_channel)

        blocks = nn.ModuleList()
        current_in_channels = input_channel

        for t, c, n, s, se in inverted_residual_setting:
            out_channel = _make_divisible(c * 1.0, 8)
            se_layer = SqueezeExcitation if se else None

            # The first block of each stage applies the stride; the rest use stride 1.
            blocks.append(InvertedResidual(current_in_channels, out_channel, s, t, se_layer))
            current_in_channels = out_channel
            for _ in range(n - 1):
                blocks.append(InvertedResidual(current_in_channels, out_channel, 1, t, se_layer))

        # Head: 1x1 projection to the latent dimension.
        self.conv_head = nn.Conv2d(current_in_channels, latent_dim, 1, 1, 0, bias=False)
        self.bn2 = norm_layer(latent_dim)

        # Reuse conv_stem/bn1 and conv_head/bn2 here (rather than fresh modules)
        # so that weights loaded under those names are the ones run in forward.
        self.spatial_encoder = nn.Sequential(
            nn.Sequential(self.conv_stem, self.bn1, nn.ReLU6(inplace=True)),
            *blocks,
            nn.Sequential(self.conv_head, self.bn2),
        )

        self.avgpool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        # Stem -> inverted residual blocks -> 1x1 head.
        x = self.spatial_encoder(x)
        # Global average pool and flatten to a (batch, latent_dim) embedding.
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return x
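

# Minimal usage sketch (not part of the original source). The 224x224 input
# size and the checkpoint filename below are assumptions for illustration.
if __name__ == "__main__":
    model = DeepSVDD()
    model.eval()
    # Hypothetical checkpoint load; strict=False tolerates key-name mismatches.
    # state = torch.load("deep_svdd.pth", map_location="cpu")
    # model.load_state_dict(state, strict=False)
    with torch.no_grad():
        dummy = torch.randn(1, 3, 224, 224)
        embedding = model(dummy)
    print(embedding.shape)  # expected: torch.Size([1, 1280])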