# Copyright (c) Meta Platforms, Inc. and affiliates.
# Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
# https://github.com/cvg/pixloc
# Released under the Apache License 2.0

"""
Flexible UNet model which takes a VGG or ResNet Torchvision backbone as encoder.
Predicts multi-level features and makes sure that they are well aligned.
"""
import torch
import torch.nn as nn
import torchvision

from .base import BaseModel
from .utils import checkpointed


class DecoderBlock(nn.Module):
    def __init__(
        self, previous, skip, out, num_convs=1, norm=nn.BatchNorm2d, padding="zeros"
    ):
        super().__init__()
        self.upsample = nn.Upsample(
            scale_factor=2, mode="bilinear", align_corners=False
        )
        layers = []
        for i in range(num_convs):
            conv = nn.Conv2d(
                previous + skip if i == 0 else out,
                out,
                kernel_size=3,
                padding=1,
                bias=norm is None,
                padding_mode=padding,
            )
            layers.append(conv)
            if norm is not None:
                layers.append(norm(out))
            layers.append(nn.ReLU(inplace=True))
        self.layers = nn.Sequential(*layers)

    def forward(self, previous, skip):
        upsampled = self.upsample(previous)
        # If the shape of the input map `skip` is not a multiple of 2,
        # it will not match the shape of the upsampled map `upsampled`.
        # If the downsampling uses ceil_mode=False, we need to crop `skip`.
        # If it uses ceil_mode=True (not supported here), we should pad it.
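        # Example: a stride-2 pooling with ceil_mode=False maps height
        # 100 -> 50 -> 25 -> 12; upsampling 12 gives 24, so the 25-row
        # skip map is cropped to 24 rows before concatenation.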
        _, _, hu, wu = upsampled.shape
        _, _, hs, ws = skip.shape
        assert (hu <= hs) and (wu <= ws), "Using ceil_mode=True in pooling?"
        # assert (hu == hs) and (wu == ws), 'Careful about padding'
        skip = skip[:, :, :hu, :wu]
        return self.layers(torch.cat([upsampled, skip], dim=1))


class AdaptationBlock(nn.Sequential):
    def __init__(self, inp, out):
        conv = nn.Conv2d(inp, out, kernel_size=1, padding=0, bias=True)
        super().__init__(conv)


class FeatureExtractor(BaseModel):
    default_conf = {
        "pretrained": True,
        "input_dim": 3,
        "output_scales": [0, 2, 4],  # what scales to adapt and output
        "output_dim": 128,  # number of channels in the output feature maps
        "encoder": "vgg16",  # string (torchvision net) or list of channels
        "num_downsample": 4,  # how many downsampling blocks (for VGG-style nets)
        "decoder": [64, 64, 64, 64],  # list of channels of the decoder
        "decoder_norm": "nn.BatchNorm2d",  # normalization in decoder blocks
        "do_average_pooling": False,
        "checkpointed": False,  # whether to use gradient checkpointing
        "padding": "zeros",
    }
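    # Example override (illustrative): output_dim may also be a per-scale
    # list, e.g. {"output_scales": [0, 2, 4], "output_dim": [32, 128, 128]}.

    # ImageNet statistics, applied in _forward when `pretrained` is set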
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    def build_encoder(self, conf):
        assert isinstance(conf.encoder, str)
        if conf.pretrained:
            assert conf.input_dim == 3

        Encoder = getattr(torchvision.models, conf.encoder)
        encoder = Encoder(weights="DEFAULT" if conf.pretrained else None)
        Block = checkpointed(torch.nn.Sequential, do=conf.checkpointed)
        assert max(conf.output_scales) <= conf.num_downsample

        if conf.encoder.startswith("vgg"):
            # Parse the layers and pack them into downsampling blocks.
            # It's easy for VGG-style nets because of their linear structure.
            # This does not handle strided convs and residual connections.
            skip_dims = []
            previous_dim = None
            blocks = [[]]
            for i, layer in enumerate(encoder.features):
                if isinstance(layer, torch.nn.Conv2d):
                    # Change the first conv layer if the input dim mismatches
                    if i == 0 and conf.input_dim != layer.in_channels:
                        args = {k: getattr(layer, k) for k in layer.__constants__}
                        args.pop("output_padding")
                        layer = torch.nn.Conv2d(
                            **{**args, "in_channels": conf.input_dim}
                        )
                    previous_dim = layer.out_channels
                elif isinstance(layer, torch.nn.MaxPool2d):
                    assert previous_dim is not None
                    skip_dims.append(previous_dim)
                    if (conf.num_downsample + 1) == len(blocks):
                        break
                    blocks.append([])  # start a new block
                    if conf.do_average_pooling:
                        assert layer.dilation == 1
                        layer = torch.nn.AvgPool2d(
                            kernel_size=layer.kernel_size,
                            stride=layer.stride,
                            padding=layer.padding,
                            ceil_mode=layer.ceil_mode,
                            count_include_pad=False,
                        )
                blocks[-1].append(layer)
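            # For the default vgg16 with num_downsample=4, this yields
            # 5 blocks with skip_dims == [64, 128, 256, 512, 512].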
            encoder = [Block(*b) for b in blocks]
        elif conf.encoder.startswith("resnet"):
            # Manually define the ResNet blocks such that
            # the downsampling comes first.
            assert conf.encoder[len("resnet") :] in ["18", "34", "50", "101"]
            assert conf.input_dim == 3, "Unsupported for now."
            block1 = torch.nn.Sequential(encoder.conv1, encoder.bn1, encoder.relu)
            block2 = torch.nn.Sequential(encoder.maxpool, encoder.layer1)
            block3 = encoder.layer2
            block4 = encoder.layer3
            block5 = encoder.layer4
            blocks = [block1, block2, block3, block4, block5]
            # Extract the output dimension of each block
            skip_dims = [encoder.conv1.out_channels]
            for i in range(1, 5):
                modules = getattr(encoder, f"layer{i}")[-1]._modules
                conv = sorted(k for k in modules if k.startswith("conv"))[-1]
                skip_dims.append(modules[conv].out_channels)
            # Add a dummy block such that the first one does not downsample
            encoder = [torch.nn.Identity()] + [Block(b) for b in blocks]
            skip_dims = [3] + skip_dims
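            # For resnet18 this gives skip_dims == [3, 64, 64, 128, 256, 512].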
            # Trim based on the requested encoder size
            encoder = encoder[: conf.num_downsample + 1]
            skip_dims = skip_dims[: conf.num_downsample + 1]
        else:
            raise NotImplementedError(conf.encoder)

        assert (conf.num_downsample + 1) == len(encoder)
        encoder = nn.ModuleList(encoder)
        return encoder, skip_dims

    def _init(self, conf):
        # Encoder
        self.encoder, skip_dims = self.build_encoder(conf)
        self.skip_dims = skip_dims

        def update_padding(module):
            if isinstance(module, nn.Conv2d):
                module.padding_mode = conf.padding

        if conf.padding != "zeros":
            self.encoder.apply(update_padding)

        # Decoder
        if conf.decoder is not None:
            assert len(conf.decoder) == (len(skip_dims) - 1)
            Block = checkpointed(DecoderBlock, do=conf.checkpointed)
            norm = eval(conf.decoder_norm) if conf.decoder_norm else None  # noqa
            previous = skip_dims[-1]
            decoder = []
            for out, skip in zip(conf.decoder, skip_dims[:-1][::-1]):
                decoder.append(
                    Block(previous, skip, out, norm=norm, padding=conf.padding)
                )
                previous = out
            self.decoder = nn.ModuleList(decoder)

        # Adaptation layers
        adaptation = []
        for idx, i in enumerate(conf.output_scales):
            if conf.decoder is None or i == (len(self.encoder) - 1):
                input_ = skip_dims[i]
            else:
                input_ = conf.decoder[-1 - i]
            # output_dim can be an int (same for all scales) or a per-scale list
            dim = conf.output_dim
            if not isinstance(dim, int):
                dim = dim[idx]
            block = AdaptationBlock(input_, dim)
            adaptation.append(block)
        self.adaptation = nn.ModuleList(adaptation)
        self.scales = [2**s for s in conf.output_scales]

    def _forward(self, data):
        image = data["image"]
        if self.conf.pretrained:
            mean, std = image.new_tensor(self.mean), image.new_tensor(self.std)
            image = (image - mean[:, None, None]) / std[:, None, None]

        skip_features = []
        features = image
        for block in self.encoder:
            features = block(features)
            skip_features.append(features)

        if self.conf.decoder:
            pre_features = [skip_features[-1]]
            for block, skip in zip(self.decoder, skip_features[:-1][::-1]):
                pre_features.append(block(pre_features[-1], skip))
            pre_features = pre_features[::-1]  # fine to coarse
        else:
            pre_features = skip_features

        out_features = []
        for adapt, i in zip(self.adaptation, self.conf.output_scales):
            out_features.append(adapt(pre_features[i]))
        pred = {"feature_maps": out_features, "skip_features": skip_features}
        return pred

    def loss(self, pred, data):
        raise NotImplementedError

    def metrics(self, pred, data):
        raise NotImplementedError
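

if __name__ == "__main__":
    # Minimal smoke test, a sketch: it assumes the PixLoc-style BaseModel
    # merges this dict into default_conf and dispatches __call__ to _forward.
    # pretrained=False avoids downloading torchvision weights.
    model = FeatureExtractor({"encoder": "resnet18", "pretrained": False})
    image = torch.rand(1, 3, 224, 224)  # NCHW, values in [0, 1]
    pred = model({"image": image})
    for scale, fmap in zip(model.scales, pred["feature_maps"]):
        print(f"scale 1/{scale}: {tuple(fmap.shape)}")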