doduo / matching.py

Upload model

edba1de over 2 years ago

16.4 kB

	import torch
	import torch.nn.functional as F

	from .geometry import coords_grid, generate_window_grid, normalize_coords


	def global_correlation_softmax_prototype(
	feature0,
	feature1,
	value,
	pred_bidir_flow=False,
	corr_mask=None,
	):
	"""
	feature0: [B, C, H, W]
	feature1: [B, C, H, W]
	value: [B, C1, H, W]
	corr_mask: [B, HW, HW] or None, if not None, the value will be masked out
	"""
	# global correlation
	b, c, h, w = feature0.shape
	c_value = value.size(1)
	value = value.view(b, c_value, -1).permute(0, 2, 1) # [B, H*W, C1]

	feature0 = feature0.view(b, c, -1).permute(0, 2, 1) # [B, H*W, C]
	feature1 = feature1.view(b, c, -1) # [B, C, H*W]

	correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (
	c**0.5
	) # [B, H, W, H, W]

	correlation = correlation.view(b, h * w, h * w) # [B, HW, HW]

	if pred_bidir_flow:
	correlation = torch.cat(
	(correlation, correlation.permute(0, 2, 1)), dim=0
	) # [2B, HW, H*W]
	value = value.repeat(2, 1, 1) # [2B, HW, 2]
	b = b * 2

	if corr_mask is not None:
	# mask out the correlation with corr_mask
	if corr_mask.dtype == torch.bool:
	# binary mask
	correlation[corr_mask] = -65504.0
	prob = F.softmax(correlation, dim=-1) # [B, HW, HW]
	else:
	# float mask
	# bin_mask = corr_mask < 0
	# correlation[bin_mask] = -65504.0
	prob = F.softmax(correlation, dim=-1) # [B, HW, HW]
	# soft_mask = torch.clamp(corr_mask, min=0)
	prob = prob * corr_mask
	# normalize
	prob = prob / (prob.sum(dim=2, keepdim=True) + 1e-8)
	else:
	prob = F.softmax(correlation, dim=-1) # [B, HW, HW]
	# if corr_mask.dtype == torch.bool:
	# # binary mask
	# bin_mask = corr_mask
	# num_true = bin_mask.sum(-1).unique().item()
	# soft_mask = torch.ones((b, h * w, num_true)).to(corr_mask.device)
	# else:
	# # float mask
	# bin_mask = corr_mask >= 0
	# num_true = bin_mask.sum(-1).unique().item()
	# soft_mask = corr_mask[bin_mask].view(b, h * w, num_true)
	# assert soft_mask.min() >= 0
	# correlation_seleted = correlation[bin_mask].view(b, h * w, num_true)
	# prob_seleted = F.softmax(correlation_seleted, dim=-1) # [B, H*W, num_true]
	# prob_seleted = soft_mask * prob_seleted
	# prob_seleted = prob_seleted / (prob_seleted.sum(dim=2, keepdim=True) + 1e-8)
	# prob = torch.zeros_like(correlation)
	# prob[bin_mask] = prob_seleted.view(-1)
	# else:
	# prob = F.softmax(correlation, dim=-1)

	result = torch.matmul(prob, value).view(b, h, w, c_value).permute(0, 3, 1, 2) # [B, 2, H, W]\
	return result, correlation


	def local_correlation_softmax_prototype(
	feature0,
	feature1,
	value,
	radius=5,
	pred_bidir_flow=False,
	corr_mask=None,
	):
	"""
	softmax around argmax point
	feature0: [B, C, H, W]
	feature1: [B, C, H, W]
	value: [B, C1, H, W]
	corr_mask: [B, HW, HW] or None, if not None, the value will be masked out
	"""
	b, c, h, w = feature0.shape
	c_value = value.size(1)
	value = value.view(b, c_value, -1).permute(0, 2, 1) # [B, H*W, C1]

	feature0 = feature0.view(b, c, -1).permute(0, 2, 1) # [B, H*W, C]
	feature1 = feature1.view(b, c, -1) # [B, C, H*W]

	correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (
	c**0.5
	) # [B, H, W, H, W]

	correlation = correlation.view(b, h * w, h * w) # [B, HW, HW]

	if pred_bidir_flow:
	correlation = torch.cat(
	(correlation, correlation.permute(0, 2, 1)), dim=0
	) # [2B, HW, H*W]
	value = value.repeat(2, 1, 1) # [2B, HW, 2]
	b = b * 2

	if corr_mask is not None:
	# mask out the correlation with corr_mask
	if corr_mask.dtype == torch.bool:
	# binary mask
	correlation[corr_mask] = -65504.0
	prob = F.softmax(correlation, dim=-1) # [B, HW, HW]
	else:
	# float mask
	# bin_mask = corr_mask < 0
	# correlation[bin_mask] = -65504.0
	prob = F.softmax(correlation, dim=-1) # [B, HW, HW]
	# soft_mask = torch.clamp(corr_mask, min=0)
	prob = prob * corr_mask
	else:
	prob = F.softmax(correlation, dim=-1) # [B, HW, HW]

	# get local prob
	# B, H*W, 2
	coords = coords_grid(b, h, w, device=feature0.device).flatten(2).permute(0, 2, 1)
	# B, HW, HW, 2
	coords = coords.unsqueeze(1).repeat(1, h * w, 1, 1)
	# B, H*W
	argmax_pos = torch.argmax(prob, dim=2)
	# B, H*W, 1, 2
	argmax_pos = argmax_pos.view(b, h * w, 1, 1).repeat(1, 1, 1, 2)
	# B, H*W, 1, 2
	pos = torch.gather(coords, 2, argmax_pos)
	# B, HW, HW
	valid = ((coords - pos).square().sum(dim=-1) < (radius**2)).float()
	prob = prob * valid

	# normalize
	prob = prob / (prob.sum(dim=2, keepdim=True) + 1e-8)
	result = torch.matmul(prob, value).view(b, h, w, c_value).permute(0, 3, 1, 2) # [B, 2, H, W]\
	return result, correlation


	def global_correlation_softmax(
	feature0,
	feature1,
	pred_bidir_flow=False,
	):
	# global correlation
	b, c, h, w = feature0.shape
	feature0 = feature0.view(b, c, -1).permute(0, 2, 1) # [B, H*W, C]
	feature1 = feature1.view(b, c, -1) # [B, C, H*W]

	correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (
	c**0.5
	) # [B, H, W, H, W]

	# flow from softmax
	init_grid = coords_grid(b, h, w).to(correlation.device) # [B, 2, H, W]
	grid = init_grid.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]

	correlation = correlation.view(b, h * w, h * w) # [B, HW, HW]

	if pred_bidir_flow:
	correlation = torch.cat(
	(correlation, correlation.permute(0, 2, 1)), dim=0
	) # [2B, HW, H*W]
	init_grid = init_grid.repeat(2, 1, 1, 1) # [2*B, 2, H, W]
	grid = grid.repeat(2, 1, 1) # [2B, HW, 2]
	b = b * 2

	prob = F.softmax(correlation, dim=-1) # [B, HW, HW]

	correspondence = torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2) # [B, 2, H, W]

	# when predicting bidirectional flow, flow is the concatenation of forward flow and backward flow
	flow = correspondence - init_grid

	return flow, prob


	def local_correlation_softmax(
	feature0,
	feature1,
	local_radius,
	padding_mode="zeros",
	):
	b, c, h, w = feature0.size()
	coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
	coords = coords_init.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]

	local_h = 2 * local_radius + 1
	local_w = 2 * local_radius + 1

	window_grid = generate_window_grid(
	-local_radius,
	local_radius,
	-local_radius,
	local_radius,
	local_h,
	local_w,
	device=feature0.device,
	) # [2R+1, 2R+1, 2]
	window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1)^2, 2]
	sample_coords = coords.unsqueeze(-2) + window_grid # [B, H*W, (2R+1)^2, 2]

	sample_coords_softmax = sample_coords

	# exclude coords that are out of image space
	valid_x = (sample_coords[:, :, :, 0] >= 0) & (
	sample_coords[:, :, :, 0] < w
	) # [B, H*W, (2R+1)^2]
	valid_y = (sample_coords[:, :, :, 1] >= 0) & (
	sample_coords[:, :, :, 1] < h
	) # [B, H*W, (2R+1)^2]

	valid = valid_x & valid_y # [B, H*W, (2R+1)^2], used to mask out invalid values when softmax

	# normalize coordinates to [-1, 1]
	sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
	window_feature = F.grid_sample(
	feature1, sample_coords_norm, padding_mode=padding_mode, align_corners=False
	).permute(
	0, 2, 1, 3
	) # [B, H*W, C, (2R+1)^2]
	feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c) # [B, H*W, 1, C]

	corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (
	c**0.5
	) # [B, H*W, (2R+1)^2]

	# mask invalid locations
	corr[~valid] = -1e9

	prob = F.softmax(corr, -1) # [B, H*W, (2R+1)^2]

	correspondence = (
	torch.matmul(prob.unsqueeze(-2), sample_coords_softmax)
	.squeeze(-2)
	.view(b, h, w, 2)
	.permute(0, 3, 1, 2)
	) # [B, 2, H, W]

	flow = correspondence - coords_init
	match_prob = prob

	return flow, match_prob


	def local_correlation_with_flow(
	feature0,
	feature1,
	flow,
	local_radius,
	padding_mode="zeros",
	dilation=1,
	):
	b, c, h, w = feature0.size()
	coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
	coords = coords_init.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]

	local_h = 2 * local_radius + 1
	local_w = 2 * local_radius + 1

	window_grid = generate_window_grid(
	-local_radius,
	local_radius,
	-local_radius,
	local_radius,
	local_h,
	local_w,
	device=feature0.device,
	) # [2R+1, 2R+1, 2]
	window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1)^2, 2]
	sample_coords = coords.unsqueeze(-2) + window_grid * dilation # [B, H*W, (2R+1)^2, 2]

	# flow can be zero when using features after transformer
	if not isinstance(flow, float):
	sample_coords = sample_coords + flow.view(b, 2, -1).permute(0, 2, 1).unsqueeze(
	-2
	) # [B, H*W, (2R+1)^2, 2]
	else:
	assert flow == 0.0

	# normalize coordinates to [-1, 1]
	sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
	window_feature = F.grid_sample(
	feature1, sample_coords_norm, padding_mode=padding_mode, align_corners=False
	).permute(
	0, 2, 1, 3
	) # [B, H*W, C, (2R+1)^2]
	feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c) # [B, H*W, 1, C]

	corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (
	c**0.5
	) # [B, H*W, (2R+1)^2]

	corr = corr.view(b, h, w, -1).permute(0, 3, 1, 2).contiguous() # [B, (2R+1)^2, H, W]

	return corr


	def global_correlation_softmax_stereo(
	feature0,
	feature1,
	):
	# global correlation on horizontal direction
	b, c, h, w = feature0.shape

	x_grid = torch.linspace(0, w - 1, w, device=feature0.device) # [W]

	feature0 = feature0.permute(0, 2, 3, 1) # [B, H, W, C]
	feature1 = feature1.permute(0, 2, 1, 3) # [B, H, C, W]

	correlation = torch.matmul(feature0, feature1) / (c**0.5) # [B, H, W, W]

	# mask subsequent positions to make disparity positive
	mask = torch.triu(torch.ones((w, w)), diagonal=1).type_as(feature0) # [W, W]
	valid_mask = (mask == 0).unsqueeze(0).unsqueeze(0).repeat(b, h, 1, 1) # [B, H, W, W]

	correlation[~valid_mask] = -1e9

	prob = F.softmax(correlation, dim=-1) # [B, H, W, W]

	correspondence = (x_grid.view(1, 1, 1, w) * prob).sum(-1) # [B, H, W]

	# NOTE: unlike flow, disparity is typically positive
	disparity = x_grid.view(1, 1, w).repeat(b, h, 1) - correspondence # [B, H, W]

	return disparity.unsqueeze(1), prob # feature resolution


	def local_correlation_softmax_stereo(
	feature0,
	feature1,
	local_radius,
	):
	b, c, h, w = feature0.size()
	coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
	coords = coords_init.view(b, 2, -1).permute(0, 2, 1).contiguous() # [B, H*W, 2]

	local_h = 1
	local_w = 2 * local_radius + 1

	window_grid = generate_window_grid(
	0, 0, -local_radius, local_radius, local_h, local_w, device=feature0.device
	) # [1, 2R+1, 2]
	window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1), 2]
	sample_coords = coords.unsqueeze(-2) + window_grid # [B, H*W, (2R+1), 2]

	sample_coords_softmax = sample_coords

	# exclude coords that are out of image space
	valid_x = (sample_coords[:, :, :, 0] >= 0) & (
	sample_coords[:, :, :, 0] < w
	) # [B, H*W, (2R+1)^2]
	valid_y = (sample_coords[:, :, :, 1] >= 0) & (
	sample_coords[:, :, :, 1] < h
	) # [B, H*W, (2R+1)^2]

	valid = valid_x & valid_y # [B, H*W, (2R+1)^2], used to mask out invalid values when softmax

	# normalize coordinates to [-1, 1]
	sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
	window_feature = F.grid_sample(
	feature1, sample_coords_norm, padding_mode="zeros", align_corners=False
	).permute(
	0, 2, 1, 3
	) # [B, H*W, C, (2R+1)]
	feature0_view = (
	feature0.permute(0, 2, 3, 1).contiguous().view(b, h * w, 1, c)
	) # [B, H*W, 1, C]

	corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (
	c**0.5
	) # [B, H*W, (2R+1)]

	# mask invalid locations
	corr[~valid] = -1e9

	prob = F.softmax(corr, -1) # [B, H*W, (2R+1)]

	correspondence = (
	torch.matmul(prob.unsqueeze(-2), sample_coords_softmax)
	.squeeze(-2)
	.view(b, h, w, 2)
	.permute(0, 3, 1, 2)
	.contiguous()
	) # [B, 2, H, W]

	flow = correspondence - coords_init # flow at feature resolution
	match_prob = prob

	flow_x = -flow[:, :1] # [B, 1, H, W]

	return flow_x, match_prob


	def correlation_softmax_depth(
	feature0,
	feature1,
	intrinsics,
	pose,
	depth_candidates,
	depth_from_argmax=False,
	pred_bidir_depth=False,
	):
	b, c, h, w = feature0.size()
	assert depth_candidates.dim() == 4 # [B, D, H, W]
	scale_factor = c**0.5

	if pred_bidir_depth:
	feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat(
	(feature1, feature0), dim=0
	)
	intrinsics = intrinsics.repeat(2, 1, 1)
	pose = torch.cat((pose, torch.inverse(pose)), dim=0)
	depth_candidates = depth_candidates.repeat(2, 1, 1, 1)

	# depth candidates are actually inverse depth
	warped_feature1 = warp_with_pose_depth_candidates(
	feature1,
	intrinsics,
	pose,
	1.0 / depth_candidates,
	) # [B, C, D, H, W]

	correlation = (feature0.unsqueeze(2) * warped_feature1).sum(1) / scale_factor # [B, D, H, W]

	match_prob = F.softmax(correlation, dim=1) # [B, D, H, W]

	# for cross-task transfer (flow -> depth), extract depth with argmax at test time
	if depth_from_argmax:
	index = torch.argmax(match_prob, dim=1, keepdim=True)
	depth = torch.gather(depth_candidates, dim=1, index=index)
	else:
	depth = (match_prob * depth_candidates).sum(dim=1, keepdim=True) # [B, 1, H, W]

	return depth, match_prob


	def warp_with_pose_depth_candidates(
	feature1,
	intrinsics,
	pose,
	depth,
	clamp_min_depth=1e-3,
	):
	"""
	feature1: [B, C, H, W]
	intrinsics: [B, 3, 3]
	pose: [B, 4, 4]
	depth: [B, D, H, W]
	"""

	assert intrinsics.size(1) == intrinsics.size(2) == 3
	assert pose.size(1) == pose.size(2) == 4
	assert depth.dim() == 4

	b, d, h, w = depth.size()
	c = feature1.size(1)

	with torch.no_grad():
	# pixel coordinates
	grid = coords_grid(b, h, w, homogeneous=True, device=depth.device) # [B, 3, H, W]
	# back project to 3D and transform viewpoint
	points = torch.inverse(intrinsics).bmm(grid.view(b, 3, -1)) # [B, 3, H*W]
	points = torch.bmm(pose[:, :3, :3], points).unsqueeze(2).repeat(1, 1, d, 1) * depth.view(
	b, 1, d, h * w
	) # [B, 3, D, H*W]
	points = points + pose[:, :3, -1:].unsqueeze(-1) # [B, 3, D, H*W]
	# reproject to 2D image plane
	points = torch.bmm(intrinsics, points.view(b, 3, -1)).view(
	b, 3, d, h * w
	) # [B, 3, D, H*W]
	pixel_coords = points[:, :2] / points[:, -1:].clamp(min=clamp_min_depth) # [B, 2, D, H*W]

	# normalize to [-1, 1]
	x_grid = 2 * pixel_coords[:, 0] / (w - 1) - 1
	y_grid = 2 * pixel_coords[:, 1] / (h - 1) - 1

	grid = torch.stack([x_grid, y_grid], dim=-1) # [B, D, H*W, 2]

	# sample features
	warped_feature = F.grid_sample(
	feature1,
	grid.view(b, d * h, w, 2),
	mode="bilinear",
	padding_mode="zeros",
	align_corners=False,
	).view(
	b, c, d, h, w
	) # [B, C, D, H, W]

	return warped_feature