# MVANet / image_processing_mvanet.py
# Uploaded by shunk031 (commit f685c19, verified)
"""Image processor for MVANet model."""
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import BaseImageProcessor
from transformers.image_processing_utils import BatchFeature
from transformers.image_utils import (
ImageInput,
PILImageResampling,
)
from transformers.utils import TensorType
def to_pil_image(image: Union[np.ndarray, torch.Tensor, Image.Image]) -> Image.Image:
    """Convert a numpy array, torch tensor, or PIL image to a PIL Image.

    Tensors are moved to CPU and converted to uint8 numpy arrays first.
    Channel-first ``(C, H, W)`` tensors (C in {1, 3, 4}) are transposed to
    channel-last; floating-point data is assumed to lie in ``[0, 1]`` and is
    rescaled to 8-bit, while integer data is assumed to already be in
    ``[0, 255]``. Numpy arrays may be 2-D (grayscale) or ``(H, W, C)`` with
    1, 3, or 4 channels.

    Raises:
        ValueError: If the input type or channel layout is unsupported.
    """
    if isinstance(image, Image.Image):
        return image
    if isinstance(image, torch.Tensor):
        tensor = image.detach().cpu()
        # Channel-first (C, H, W) -> channel-last (H, W, C) for PIL.
        if tensor.ndim == 3 and tensor.shape[0] in (1, 3, 4):
            tensor = tensor.permute(1, 2, 0)
        array = tensor.numpy()
        # Only floating-point data is rescaled; uint8 data is already 0-255,
        # and multiplying it by 255 would saturate every pixel.
        if np.issubdtype(array.dtype, np.floating):
            array = array * 255
        image = array.clip(0, 255).astype(np.uint8)
    if isinstance(image, np.ndarray):
        if image.ndim == 2:
            # Grayscale without an explicit channel axis.
            return Image.fromarray(image, mode="L")
        if image.ndim == 3:
            channels = image.shape[2]
            if channels == 1:
                return Image.fromarray(image.squeeze(2), mode="L")
            if channels == 3:
                return Image.fromarray(image, mode="RGB")
            if channels == 4:
                return Image.fromarray(image, mode="RGBA")
    raise ValueError(f"Unsupported image type: {type(image)}")
class MVANetImageProcessor(BaseImageProcessor):
    """
    Constructs a MVANet image processor.
    Args:
        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to resize the image.
        size (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`{"height": 1024, "width": 1024}`):
            Target size for resizing. MVANet was trained on 1024x1024 images.
        resample (:obj:`PILImageResampling`, `optional`, defaults to :obj:`PILImageResampling.BILINEAR`):
            Resampling filter to use when resizing the image.
        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to rescale to [0, 1] and normalize the image.
        image_mean (:obj:`List[float]`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]`):
            Mean to use for normalization (ImageNet mean).
        image_std (:obj:`List[float]`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`):
            Standard deviation to use for normalization (ImageNet std).
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_normalize: bool = True,
        image_mean: Optional[List[float]] = None,
        image_std: Optional[List[float]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.do_resize = do_resize
        # MVANet was trained on 1024x1024 inputs.
        self.size = size if size is not None else {"height": 1024, "width": 1024}
        self.resample = resample
        self.do_normalize = do_normalize
        # ImageNet statistics by default.
        self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406]
        self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225]

    def resize(
        self,
        image: Image.Image,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
    ) -> Image.Image:
        """Resize ``image`` to ``size`` (``{"height": ..., "width": ...}``).

        Note: ``PIL.Image.resize`` takes ``(width, height)`` order.
        """
        return image.resize((size["width"], size["height"]), resample)

    def normalize(
        self,
        image: np.ndarray,
        mean: List[float],
        std: List[float],
    ) -> np.ndarray:
        """Rescale an 8-bit ``(H, W, C)`` image to [0, 1] and normalize it.

        NOTE: unlike ``BaseImageProcessor.normalize``, this also divides by
        255 first, so it expects uint8 pixel values as produced by
        ``np.array`` on a PIL image.
        """
        image = image.astype(np.float32) / 255.0
        # Per-channel mean/std broadcast over the trailing channel axis.
        mean_arr = np.array(mean, dtype=np.float32)
        std_arr = np.array(std, dtype=np.float32)
        return (image - mean_arr) / std_arr

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[List[float]] = None,
        image_std: Optional[List[float]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess images for MVANet.
        Args:
            images (:obj:`ImageInput`):
                Images to preprocess. Can be a single image or a batch
                (list or tuple) of images.
            do_resize (:obj:`bool`, `optional`):
                Whether to resize the image(s). Defaults to :obj:`self.do_resize`.
            size (:obj:`Dict[str, int]`, `optional`):
                Target size for resizing. Defaults to :obj:`self.size`.
            resample (:obj:`PILImageResampling`, `optional`):
                Resampling filter to use. Defaults to :obj:`self.resample`.
            do_normalize (:obj:`bool`, `optional`):
                Whether to normalize the image(s). Defaults to :obj:`self.do_normalize`.
            image_mean (:obj:`List[float]`, `optional`):
                Mean for normalization. Defaults to :obj:`self.image_mean`.
            image_std (:obj:`List[float]`, `optional`):
                Std for normalization. Defaults to :obj:`self.image_std`.
            return_tensors (:obj:`str` or :obj:`TensorType`, `optional`):
                Type of tensors to return. Can be 'pt' for PyTorch.
        Returns:
            :obj:`BatchFeature`: A :obj:`BatchFeature` with the following fields:
            - pixel_values (:obj:`torch.Tensor`): Preprocessed images of
              shape ``(batch, 3, H, W)``.
        """
        # Fall back to processor-level defaults for any unset option.
        do_resize = self.do_resize if do_resize is None else do_resize
        size = self.size if size is None else size
        resample = self.resample if resample is None else resample
        do_normalize = self.do_normalize if do_normalize is None else do_normalize
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std

        # Accept a single image or a list/tuple batch.
        if not isinstance(images, (list, tuple)):
            images = [images]

        # Convert every input to an RGB PIL image.
        pil_images = []
        for image in images:
            pil_image = to_pil_image(image)
            if pil_image.mode != "RGB":
                pil_image = pil_image.convert("RGB")
            pil_images.append(pil_image)

        if do_resize:
            pil_images = [self.resize(img, size, resample) for img in pil_images]

        # (H, W, C) uint8 arrays.
        arrays = [np.array(img) for img in pil_images]
        if do_normalize:
            arrays = [self.normalize(arr, image_mean, image_std) for arr in arrays]

        # (H, W, C) -> (C, H, W), then stack into a (B, C, H, W) batch.
        batch = np.stack([arr.transpose(2, 0, 1) for arr in arrays])
        if return_tensors == "pt":
            # from_numpy avoids the extra copy made by torch.tensor; the
            # float32 cast matches the model's expected input dtype.
            pixel_values = torch.from_numpy(batch).to(torch.float32)
        else:
            pixel_values = batch
        return BatchFeature(
            data={"pixel_values": pixel_values}, tensor_type=return_tensors
        )

    def post_process_semantic_segmentation(
        self,
        outputs,
        target_sizes: Optional[List[Tuple[int, int]]] = None,
    ) -> List[torch.Tensor]:
        """
        Post-process model outputs to semantic segmentation masks.
        Args:
            outputs (:obj:`SemanticSegmenterOutput` or :obj:`torch.Tensor`):
                Model outputs containing logits of shape ``(B, 1, H, W)``.
            target_sizes (:obj:`List[Tuple[int, int]]`, `optional`):
                List of target sizes (width, height) for each image.
                If not provided, returns masks at model output size.
        Returns:
            :obj:`List[torch.Tensor]`: List of ``(H, W)`` segmentation
            masks (values in [0, 1]).
        """
        logits = outputs.logits if hasattr(outputs, "logits") else outputs
        # Sigmoid turns the single-channel logits into per-pixel
        # foreground probabilities; shape stays (B, 1, H, W).
        probs = torch.sigmoid(logits)

        if target_sizes is None:
            # Drop the channel dim: (1, H, W) -> (H, W) per image.
            return [probs[i].squeeze(0) for i in range(probs.shape[0])]

        masks = []
        for i, (target_w, target_h) in enumerate(target_sizes):
            # F.interpolate expects size=(height, width) while
            # target_sizes stores (width, height).
            resized = F.interpolate(
                probs[i : i + 1],
                size=(target_h, target_w),
                mode="bilinear",
                align_corners=False,
            )
            masks.append(resized.squeeze(0).squeeze(0))  # (H, W)
        return masks