Upload processor

Browse files

Files changed (2) hide show

image_processor.py +257 -0
preprocessor_config.json +22 -0

image_processor.py ADDED Viewed

	@@ -0,0 +1,257 @@

+from typing import Dict, List, Optional, Tuple, Union, Iterable
+import numpy as np
+import torch
+import transformers
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_transforms import (
+    ChannelDimension,
+    get_resize_output_image_size,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from transformers.image_utils import (
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    get_channel_dimension_axis,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from transformers.utils import is_torch_tensor
+class FaceSegformerImageProcessor(BaseImageProcessor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.image_size = kwargs.get("image_size", (224, 224))
+        self.normalize_mean = kwargs.get("normalize_mean", [0.485, 0.456, 0.406])
+        self.normalize_std = kwargs.get("normalize_std", [0.229, 0.224, 0.225])
+        self.resample = kwargs.get("resample", PILImageResampling.BILINEAR)
+        self.data_format = kwargs.get("data_format", ChannelDimension.FIRST)
+    @staticmethod
+    def normalize(
+        image: np.ndarray,
+        mean: Union[float, Iterable[float]],
+        std: Union[float, Iterable[float]],
+        max_pixel_value: float = 255.0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Copied from:
+        https://github.com/huggingface/transformers/blob/3eddda1111f70f3a59485e08540e8262b927e867/src/transformers/image_transforms.py#L209
+        BUT uses the formula from albumentations:
+        https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Normalize
+        img = (img - mean * max_pixel_value) / (std * max_pixel_value)
+        """
+        if not isinstance(image, np.ndarray):
+            raise ValueError("image must be a numpy array")
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+        channel_axis = get_channel_dimension_axis(
+            image, input_data_format=input_data_format
+        )
+        num_channels = image.shape[channel_axis]
+        # We cast to float32 to avoid errors that can occur when subtracting uint8 values.
+        # We preserve the original dtype if it is a float type to prevent upcasting float16.
+        if not np.issubdtype(image.dtype, np.floating):
+            image = image.astype(np.float32)
+        if isinstance(mean, Iterable):
+            if len(mean) != num_channels:
+                raise ValueError(
+                    f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}"
+                )
+        else:
+            mean = [mean] * num_channels
+        mean = np.array(mean, dtype=image.dtype)
+        if isinstance(std, Iterable):
+            if len(std) != num_channels:
+                raise ValueError(
+                    f"std must have {num_channels} elements if it is an iterable, got {len(std)}"
+                )
+        else:
+            std = [std] * num_channels
+        std = np.array(std, dtype=image.dtype)
+        # Uses max_pixel_value for normalization
+        if input_data_format == ChannelDimension.LAST:
+            image = (image - mean * max_pixel_value) / (std * max_pixel_value)
+        else:
+            image = ((image.T - mean * max_pixel_value) / (std * max_pixel_value)).T
+        image = (
+            to_channel_dimension_format(image, data_format, input_data_format)
+            if data_format is not None
+            else image
+        )
+        return image
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Copied from:
+        https://github.com/huggingface/transformers/blob/3eddda1111f70f3a59485e08540e8262b927e867/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py
+        """
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain either 'shortest_edge' or 'height' and 'width'."
+            )
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            default_to_square=default_to_square,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+    def __call__(self, images: ImageInput, masks: ImageInput = None, **kwargs):
+        """
+        Adapted from:
+        https://github.com/huggingface/transformers/blob/3eddda1111f70f3a59485e08540e8262b927e867/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py
+        """
+        # single to iterable if needed
+        images = make_list_of_images(images)
+        # validate
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        # make numpy arrays
+        images = [to_numpy_array(image) for image in images]
+        # get channel dimensions
+        input_data_format = kwargs.get("input_data_format")
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+        # check if training
+        # todo: can also assume if masks are passed that we are doing training?
+        if kwargs.get("do_training", False) is True:
+            if mask is None:
+                raise ValueError("must pass masks if doing training.")
+            # todo: implement this soon.
+            raise NotImplementedError("not yet implemented.")
+            # Assume we want to do all transformations for training
+        else:
+            # do transformations for inference...
+            images = [
+                self.resize(
+                    image=image,
+                    size={
+                        "shortest_edge": min(
+                            kwargs.get("image_size") or self.image_size
+                        )
+                    },
+                    resample=kwargs.get("resample") or self.resample,
+                    input_data_format=input_data_format,
+                )
+                for image in images
+            ]
+            images = [
+                self.normalize(
+                    image=image,
+                    mean=kwargs.get("normalize_mean") or self.normalize_mean,
+                    std=kwargs.get("normalize_std") or self.normalize_std,
+                    input_data_format=input_data_format,
+                )
+                for image in images
+            ]
+        # fix dimensions
+        images = [
+            to_channel_dimension_format(
+                image,
+                kwargs.get("data_format") or self.data_format,
+                input_channel_dim=input_data_format,
+            )
+            for image in images
+        ]
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type="pt")
+    # Copied from transformers.models.segformer.image_processing_segformer.SegformerImageProcessor.post_process_semantic_segmentation
+    def post_process_semantic_segmentation(
+        self, outputs, target_sizes: List[Tuple] = None
+    ):
+        """
+        Converts the output of [`SegformerForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
+        Args:
+            outputs ([`SegformerForSemanticSegmentation`]):
+                Raw outputs of the model.
+            target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
+                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
+                predictions will not be resized.
+        Returns:
+            semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
+            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
+            specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
+        """
+        # TODO: add support for other frameworks
+        logits = outputs.logits
+        # Resize logits and compute semantic segmentation maps
+        if target_sizes is not None:
+            if len(logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+            if is_torch_tensor(target_sizes):
+                target_sizes = target_sizes.numpy()
+            semantic_segmentation = []
+            for idx in range(len(logits)):
+                resized_logits = torch.nn.functional.interpolate(
+                    logits[idx].unsqueeze(dim=0),
+                    size=target_sizes[idx],
+                    mode="bilinear",
+                    align_corners=False,
+                )
+                semantic_map = resized_logits[0].argmax(dim=0)
+                semantic_segmentation.append(semantic_map)
+        else:
+            semantic_segmentation = logits.argmax(dim=1)
+            semantic_segmentation = [
+                semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])
+            ]
+        return semantic_segmentation

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processor.FaceSegformerImageProcessor"
+  },
+  "data_format": "channels_first",
+  "image_processor_type": "FaceSegformerImageProcessor",
+  "image_size": [
+    224,
+    224
+  ],
+  "normalize_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "normalize_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 2
+}