Spaces:

osherr
/

Prior2DSM

Running

App Files Files Community

Prior2DSM / src /dinov3 /eval /segmentation /inference.py

osherr

Upload 222 files

bc90483 verified 5 months ago

raw

history blame contribute delete

8.3 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	#
	# This software may be used and distributed in accordance with
	# the terms of the DINOv3 License Agreement.

	from typing import Callable, Optional, Tuple

	import torch
	import torch.distributed as dist
	import torch.nn.functional as F
	from torch import nn
	from torchvision.transforms import functional as Fv

	import dinov3.distributed as distributed


	def precompute_forward_number_for_sliding_inference(
	    test_dataloader,
	    dataset_len: int,
	    eval_crop_size: int,
	    eval_stride: int,
	):
	    """Return the largest number of sliding-window forwards any sample needs.

	    Iterates over `test_dataloader`, computes for every image how many
	    square crops of side `eval_crop_size`, spaced by `eval_stride`, cover
	    it, stores the per-sample maximum in a tensor of length `dataset_len`,
	    MAX-reduces that tensor across processes with `dist.all_reduce`, and
	    returns its global maximum.

	    Args:
	        test_dataloader: iterable yielding `(batch_img, target)` pairs where
	            `target` unpacks into `(index, _)`. `index.item()` is called, so
	            `index` must hold exactly one element.
	        dataset_len (int): length of the per-sample counter tensor; sample
	            indices are used to address it.
	        eval_crop_size (int): crop side, used for both height and width.
	        eval_stride (int): window stride, used for both height and width.

	    Returns:
	        The maximum crop count over all samples and all processes, as a
	        Python number (`Tensor.item()`).
	    """
	    # int32 instead of int8: a crop count above 127 would overflow int8 and
	    # the all-reduced maximum would then be wrong.
	    # NOTE(review): the rank is passed as an integer device index, which
	    # presumably assumes rank == local device index - confirm for multi-node.
	    image_crop_nums = torch.zeros(dataset_len, device=distributed.get_rank(), dtype=torch.int32)
	    print("Computing the number of forwards for sliding window evaluation")
	    for batch_img, target in test_dataloader:
	        # Dataset is wrapped in DatasetWithEnumeratedTargets
	        # and has index information
	        index, _ = target
	        sample_index = index.item()
	        # Only keep samples with non-negative indices
	        if sample_index < 0:
	            continue
	        batch_image_crops = []
	        for img in batch_img:
	            # Number of crops to create, i.e. the number of forwards for this image
	            h_img, w_img = img.shape[-2:]
	            h_grids = max(h_img - eval_crop_size + eval_stride - 1, 0) // eval_stride + 1
	            w_grids = max(w_img - eval_crop_size + eval_stride - 1, 0) // eval_stride + 1
	            batch_image_crops.append(h_grids * w_grids)
	        # Record the largest count of the batch in the global tensor
	        image_crop_nums[sample_index] = max(batch_image_crops)
	    # Every process ends up with the per-sample maximum over all processes
	    dist.all_reduce(image_crop_nums, op=dist.ReduceOp.MAX)
	    return torch.max(image_crop_nums).item()


	def make_inference(
	    x: torch.Tensor,
	    segmentation_model: nn.Module,
	    inference_mode: str = "whole",
	    decoder_head_type: str = "linear",
	    rescale_to=(512, 512),
	    n_output_channels: int = 256,
	    crop_size: Optional[Tuple[int]] = None,
	    stride: Optional[Tuple[int]] = None,
	    apply_horizontal_flip: bool = False,
	    num_max_forward: int = 1,
	    output_activation: Optional[Callable] = None,
	):
	    """Run the segmentation model on one input and post-process the prediction.

	    With `inference_mode="whole"` the input is bilinearly resized to
	    512x512 and passed once to `segmentation_model.predict`. With
	    `inference_mode="slide"` the prediction is assembled by
	    `slide_inference` from overlapping windows and then bilinearly resized
	    to `rescale_to`. Afterwards the prediction is flipped horizontally if
	    `apply_horizontal_flip` is set (reverting horizontal-flip TTA) and
	    `output_activation` is applied if given.

	    Args:
	        x (tensor): input image to make inference on.
	        segmentation_model (nn.Module): model to evaluate; requires a
	            `predict(image, rescale_to=...)` method.
	        inference_mode (str, optional): "whole" or "slide". Defaults to "whole".
	        decoder_head_type (str, optional): when "m2f", the model output is a
	            dict with "pred_masks" and "pred_logits" that is combined into a
	            per-class map. Defaults to "linear".
	        rescale_to (tuple, optional): output size of the prediction.
	            Defaults to (512, 512).
	        n_output_channels (int): number of output channels; only forwarded
	            to `slide_inference`.
	        crop_size (tuple, optional): (h_crop, w_crop); required in slide mode.
	        stride (tuple, optional): (h_stride, w_stride); required in slide mode.
	        apply_horizontal_flip (bool): flip the prediction horizontally
	            before returning it. Defaults to False.
	        num_max_forward (int): only forwarded to `slide_inference`.
	        output_activation (callable, optional): applied on top of the
	            prediction, e.g. softmax when each pixel belongs to a single
	            class (multiclass) or sigmoid when a pixel can belong to several
	            classes (multilabel). Defaults to None (identity).

	    Returns:
	        Tensor: the segmentation result for the input image.
	    """
	    assert inference_mode in ("whole", "slide")

	    if inference_mode == "whole":
	        # Single forward on the input resized to a fixed 512x512 resolution.
	        resized_input = F.interpolate(x, size=(512, 512), mode="bilinear", align_corners=False)
	        pred = segmentation_model.predict(resized_input, rescale_to=rescale_to)
	        if decoder_head_type == "m2f":
	            # Weight every query mask by its class probabilities, dropping
	            # the last logit column before summing over queries.
	            mask_probs = pred["pred_masks"].sigmoid()
	            class_probs = F.softmax(pred["pred_logits"], dim=-1)[..., :-1]
	            pred = torch.einsum(
	                "bqc,bqhw->bchw",
	                class_probs.to(torch.float),
	                mask_probs.to(torch.float),
	            )
	    else:
	        # crop size and stride are needed for sliding inference
	        assert crop_size is not None
	        assert stride is not None
	        stitched = slide_inference(
	            x,
	            segmentation_model,
	            decoder_head_type,
	            n_output_channels=n_output_channels,
	            crop_size=crop_size,
	            stride=stride,
	            num_max_forward=num_max_forward,
	        )
	        pred = F.interpolate(stitched, size=rescale_to, mode="bilinear", align_corners=False)

	    if apply_horizontal_flip:
	        pred = Fv.hflip(pred)
	    if output_activation:
	        pred = output_activation(pred)
	    return pred


	def slide_inference(
	    inputs: torch.Tensor,
	    segmentation_model: nn.Module,
	    decoder_head_type: str = "linear",
	    n_output_channels: int = 256,
	    crop_size: Tuple = (512, 512),
	    stride: Tuple = (341, 341),
	    num_max_forward: int = 1,
	):
	    """Inference by sliding-window with overlap.

	    The image is covered by windows of size `crop_size` placed every
	    `stride` pixels; windows that would run past the border are shifted
	    back inside the image. Each window is passed to
	    `segmentation_model.predict`, predictions are summed on CPU and
	    finally divided by the number of windows covering each pixel.

	    If both h_crop > h_img and w_crop > w_img, a square crop of side
	    min(h_img, w_img) is used instead, without padding. If only one crop
	    dimension exceeds the image, the window is clipped to the image extent
	    in that dimension.

	    Args:
	        inputs (tensor): the tensor should have a shape NxCxHxW; N must be 1.
	        segmentation_model (nn.Module): model to use for evaluating on dense
	            tasks; requires a `predict(image, rescale_to=...)` method.
	        decoder_head_type (str): when "m2f", each window output is a dict
	            with "pred_masks" and "pred_logits" that is combined into a
	            per-class map before accumulation.
	        n_output_channels (int): number of output channels
	        crop_size (tuple): (h_crop, w_crop)
	        stride (tuple): (h_stride, w_stride)
	        num_max_forward (int): minimum number of `predict` calls to issue;
	            missing calls are made on zero-filled dummy crops.

	    Returns:
	        Tensor: CPU tensor of shape (1, n_output_channels, H, W) holding the
	        window-averaged prediction.
	    """
	    h_stride, w_stride = stride
	    h_crop, w_crop = crop_size
	    batch_size, C, h_img, w_img = inputs.shape
	    if h_crop > h_img and w_crop > w_img:  # Meaning we are doing < 1.0 TTA
	        h_crop = w_crop = min(h_img, w_img)
	    assert batch_size == 1  # As of now, the code assumes that a single image is passed at a time at inference time
	    h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
	    w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
	    # Accumulators live on CPU and are allocated there directly.
	    preds = inputs.new_zeros((1, n_output_channels, h_img, w_img), device="cpu")
	    # int32 instead of int8: with a small stride a pixel can be covered by
	    # more than 127 windows, which would overflow an int8 counter.
	    count_mat = torch.zeros((1, 1, h_img, w_img), dtype=torch.int32, device="cpu")
	    for h_idx in range(h_grids):
	        for w_idx in range(w_grids):
	            # Clamp the window to the image, then shift its origin back so
	            # it keeps the full crop size whenever the image is large enough.
	            y2 = min(h_idx * h_stride + h_crop, h_img)
	            x2 = min(w_idx * w_stride + w_crop, w_img)
	            y1 = max(y2 - h_crop, 0)
	            x1 = max(x2 - w_crop, 0)
	            crop_img = inputs[:, :, y1:y2, x1:x2]
	            crop_pred = segmentation_model.predict(crop_img, rescale_to=crop_img.shape[2:])
	            if decoder_head_type == "m2f":
	                mask_pred, mask_cls = crop_pred["pred_masks"], crop_pred["pred_logits"]
	                mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
	                mask_pred = mask_pred.sigmoid()
	                crop_pred = torch.einsum("bqc,bqhw->bchw", mask_cls.to(torch.bfloat16), mask_pred.to(torch.bfloat16))
	                del mask_cls, mask_pred
	            # Add the window prediction into the region it covers; this
	            # avoids building a zero-padded full-image tensor per window.
	            preds[:, :, y1:y2, x1:x2] += crop_pred.cpu()
	            count_mat[:, :, y1:y2, x1:x2] += 1
	            del crop_img, crop_pred
	    # Optional buffer to ensure each gpu does the same number of operations for sharded models
	    for _ in range(h_grids * w_grids, num_max_forward):
	        dummy_input = inputs.new_zeros((1, C, h_crop, w_crop))
	        _ = segmentation_model.predict(dummy_input, rescale_to=dummy_input.shape[2:])
	    # Every pixel must have been covered by at least one window
	    assert (count_mat == 0).sum() == 0
	    return preds / count_mat