Spaces:
Running on Zero
Running on Zero
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # | |
| # This source code is licensed under the Apache License, Version 2.0 | |
| # found in the LICENSE file in the root directory of this source tree. | |
| # References: | |
| # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py | |
| # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py | |
| from typing import Callable, Optional, Tuple, Union | |
| import torch | |
| from torch import Tensor | |
| import torch.nn as nn | |
def make_2tuple(x):
    """Return *x* as a pair.

    An ``int`` is duplicated into ``(x, x)``; a tuple is returned unchanged
    after checking that it has exactly two elements. Anything else trips an
    assertion.
    """
    if not isinstance(x, tuple):
        # Scalar case: the same value is used for both entries.
        assert isinstance(x, int)
        return (x, x)

    assert len(x) == 2
    return x
def _vgg19_layers():
    """Yield the encoder layers in the exact order the ``vgg`` Sequential holds them.

    Layout: one 1x1 convolution on the 3-channel input, followed by five
    stages. Each stage is a run of (reflection pad -> 3x3 conv -> ReLU)
    triples, and consecutive stages are separated by a 2x2 max-pool with
    ``ceil_mode=True``. The resulting indices matter: the checkpoint keys and
    the slices taken by ``FeatureExtractor`` both depend on them.
    """
    yield nn.Conv2d(3, 3, (1, 1))

    # (output channels, number of conv layers) per stage:
    # relu1-x, relu2-x, relu3-x, relu4-x, relu5-x
    stages = ((64, 2), (128, 2), (256, 4), (512, 4), (512, 4))

    channels = 3
    for stage_index, (width, depth) in enumerate(stages):
        if stage_index > 0:
            # Down-sample between stages (not before the first one).
            yield nn.MaxPool2d((2, 2), (2, 2), (0, 0), ceil_mode=True)
        for _ in range(depth):
            yield nn.ReflectionPad2d((1, 1, 1, 1))
            yield nn.Conv2d(channels, width, (3, 3))
            yield nn.ReLU()
            channels = width


# Index 30 (relu4-1) is the last layer used by FeatureExtractor.
vgg = nn.Sequential(*_vgg19_layers())
class FeatureExtractor(nn.Module):
    """Frozen multi-stage encoder that returns the input stacked with its features.

    The children of ``encoder`` are split into four consecutive stages
    (ending at relu1_1, relu2_1, relu3_1 and relu4_1 for the layer layout used
    by ``vgg`` in this file). All stage parameters have ``requires_grad``
    disabled.
    """

    def __init__(self, encoder):
        super().__init__()
        layers = list(encoder.children())
        self.enc_1 = nn.Sequential(*layers[:4])  # input -> relu1_1
        self.enc_2 = nn.Sequential(*layers[4:11])  # relu1_1 -> relu2_1
        self.enc_3 = nn.Sequential(*layers[11:18])  # relu2_1 -> relu3_1
        self.enc_4 = nn.Sequential(*layers[18:31])  # relu3_1 -> relu4_1
        self.mse_loss = nn.MSELoss()

        # Freeze the encoder weights.
        for stage in (self.enc_1, self.enc_2, self.enc_3, self.enc_4):
            for weight in stage.parameters():
                weight.requires_grad = False

    def forward(self, input):
        """Run the four stages and concatenate input + features along channels.

        Each stage consumes the un-resized output of the previous one. Every
        stage output is bilinearly resized to the spatial size of ``input``
        before concatenation on dim 1.
        """
        target_size = input.shape[2:]
        stacked = [input]
        activation = input
        for stage in (self.enc_1, self.enc_2, self.enc_3, self.enc_4):
            activation = stage(activation)
            resized = nn.functional.interpolate(
                activation, size=target_size, mode='bilinear', align_corners=False
            )
            stacked.append(resized)
        # input, relu1_1, relu2_1, relu3_1, relu4_1 joined on the channel axis
        return torch.concat(stacked, dim=1)
class VGG19PatchEmbed(nn.Module):
    """
    2D image to patch embedding on top of frozen VGG features: (B,C,H,W) -> (B,N,D)

    The image is first passed through a ``FeatureExtractor`` built from the
    module-level ``vgg`` network. Its output (the image concatenated with the
    relu1_1, relu2_1, relu3_1 and relu4_1 activations, 3 + 64 + 128 + 256 + 512
    channels) is then turned into patch tokens by a strided convolution.

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Accepted for signature compatibility but not used; the
            projection input width is fixed by the feature extractor output.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
        flatten_embedding: If True, ``forward`` returns (B, N, D); otherwise
            the tokens are reshaped to (B, H', W', D).
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        # Map the checkpoint to CPU so that a file saved from a GPU still
        # loads on CPU-only hosts; load_state_dict copies into vgg's own
        # parameters afterwards.
        # NOTE(review): torch.load unpickles the file -- only use a trusted checkpoint.
        state_dict = torch.load("checkpoints/vgg_normalised.pth", map_location="cpu")
        vgg.load_state_dict(state_dict)

        self.feature_extractor = FeatureExtractor(vgg)
        self.feature_extractor.eval()  # set to eval mode
        self.feature_extractor.requires_grad_(False)

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        # input image + relu1-1, relu2-1, relu3-1, relu4-1
        self.in_chans = 3 + 64 + 128 + 256 + 512
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        self.proj = nn.Conv2d(self.in_chans, self.embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Embed a batch of images into patch tokens.

        Raises:
            AssertionError: if the input height or width is not a multiple of
                the corresponding patch dimension.
        """
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.feature_extractor(x)  # B C H W, where C = 3 + 64 + 128 + 256 + 512
        x = self.proj(x)
        H, W = x.size(2), x.size(3)  # patch grid size after the strided projection
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        """Estimate the cost of the patch projection plus normalization, if any.

        The feature extractor is not included in the count.
        """
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        # self.norm is never None: without a norm_layer it is nn.Identity(),
        # which does no work and must not be counted.
        if self.norm is not None and not isinstance(self.norm, nn.Identity):
            flops += Ho * Wo * self.embed_dim
        return flops