ASL-MoViNet-T5-translator

Sleeping

App Files Files Community

ASL-MoViNet-T5-translator / official /vision /dataloaders /segmentation_input.py

deanna-emery

updates

93528c6 over 2 years ago

raw

history blame contribute delete

11.8 kB

	# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Data parser and processing for segmentation datasets."""

	import tensorflow as tf, tf_keras
	from official.vision.configs import semantic_segmentation as config_lib
	from official.vision.dataloaders import decoder
	from official.vision.dataloaders import parser
	from official.vision.dataloaders import utils
	from official.vision.ops import preprocess_ops


	class Decoder(decoder.Decoder):
	  """A tf.Example decoder for segmentation task.

	  Holds a feature spec that maps every expected tf.Example key to a scalar
	  `tf.io.FixedLenFeature` and uses it to parse single serialized examples.
	  """

	  def __init__(self,
	               image_feature=config_lib.DenseFeatureConfig(),
	               additional_dense_features=None):
	    """Builds the feature spec used by `decode`.

	    Args:
	      image_feature: config object whose `feature_name` is registered as a
	        string feature (the encoded input image).
	      additional_dense_features: optional iterable of config objects; the
	        `feature_name` of each one is registered as an extra string feature.
	    """

	    def _string_feature():
	      # Scalar bytes feature that falls back to an empty string.
	      return tf.io.FixedLenFeature((), tf.string, default_value='')

	    def _int_feature():
	      # Scalar int64 feature that falls back to zero.
	      return tf.io.FixedLenFeature((), tf.int64, default_value=0)

	    spec = {
	        'image/encoded': _string_feature(),
	        'image/height': _int_feature(),
	        'image/width': _int_feature(),
	        'image/segmentation/class/encoded': _string_feature(),
	    }
	    # The configured image key may coincide with 'image/encoded'; assigning
	    # it afterwards simply overwrites that entry, as a dict literal would.
	    spec[image_feature.feature_name] = _string_feature()
	    for extra_cfg in additional_dense_features or ():
	      spec[extra_cfg.feature_name] = _string_feature()

	    self._keys_to_features = spec

	  def decode(self, serialized_example):
	    """Parses one serialized tf.Example into a dict keyed by feature name."""
	    feature_spec = self._keys_to_features
	    return tf.io.parse_single_example(serialized_example, feature_spec)


	class Parser(parser.Parser):
	  """Parser to parse an image and its annotations into a dictionary of tensors."""

	  def __init__(self,
	               output_size,
	               crop_size=None,
	               resize_eval_groundtruth=True,
	               gt_is_matting_map=False,
	               groundtruth_padded_size=None,
	               ignore_label=255,
	               aug_rand_hflip=False,
	               preserve_aspect_ratio=True,
	               aug_scale_min=1.0,
	               aug_scale_max=1.0,
	               dtype='float32',
	               image_feature=config_lib.DenseFeatureConfig(),
	               additional_dense_features=None):
	    """Initializes parameters for parsing annotations in the dataset.

	    Args:
	      output_size: `Tensor` or `list` for [height, width] of output image. The
	        output_size should be divided by the largest feature stride
	        2^max_level.
	      crop_size: `Tensor` or `list` for [height, width] of the crop. If
	        specified a training crop of size crop_size is returned. This is
	        useful for cropping original images during training while evaluating
	        on original image sizes.
	      resize_eval_groundtruth: `bool`, if True, eval ground-truth masks are
	        resized to output_size.
	      gt_is_matting_map: `bool`, if True, the expected mask is in the range
	        between 0 and 255. The parser will normalize the value of the mask
	        into the range between 0 and 1.
	      groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
	        resize_eval_groundtruth is set to False, the ground-truth masks are
	        padded to this size.
	      ignore_label: `int` the pixel with ignore label will not be used for
	        training and evaluation.
	      aug_rand_hflip: `bool`, if True, augment training with random horizontal
	        flip.
	      preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved,
	        otherwise, the image is resized to output_size.
	      aug_scale_min: `float`, the minimum scale applied to `output_size` for
	        data augmentation during training.
	      aug_scale_max: `float`, the maximum scale applied to `output_size` for
	        data augmentation during training.
	      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
	      image_feature: the config for the image input (usually RGB). Defaults to
	        the config for a 3-channel image with key = `image/encoded` and
	        ImageNet dataset mean/stddev.
	      additional_dense_features: `list` of DenseFeatureConfig for additional
	        dense features.

	    Raises:
	      ValueError: if `resize_eval_groundtruth` is False and
	        `groundtruth_padded_size` is not given.
	    """
	    self._output_size = output_size
	    self._crop_size = crop_size
	    self._resize_eval_groundtruth = resize_eval_groundtruth
	    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
	      # Note the trailing space: the two literals are concatenated.
	      raise ValueError('groundtruth_padded_size ([height, width]) needs to be '
	                       'specified when resize_eval_groundtruth is False.')
	    self._gt_is_matting_map = gt_is_matting_map
	    self._groundtruth_padded_size = groundtruth_padded_size
	    self._ignore_label = ignore_label
	    self._preserve_aspect_ratio = preserve_aspect_ratio

	    # Data augmentation.
	    self._aug_rand_hflip = aug_rand_hflip
	    self._aug_scale_min = aug_scale_min
	    self._aug_scale_max = aug_scale_max

	    # dtype.
	    self._dtype = dtype

	    self._image_feature = image_feature
	    self._additional_dense_features = additional_dense_features

	  def _decode_dense_feature(self, data, feature_cfg, height, width):
	    """Decodes one encoded dense feature and normalizes it.

	    Args:
	      data: dict of parsed example fields, indexed by feature name.
	      feature_cfg: config object providing `feature_name`, `num_channels`,
	        `mean` and `stddev`.
	      height: value of `data['image/height']`.
	      width: value of `data['image/width']`.

	    Returns:
	      The normalized feature, reshaped to (height, width, num_channels)
	      before normalization.
	    """
	    feature = tf.io.decode_image(
	        data[feature_cfg.feature_name],
	        channels=feature_cfg.num_channels,
	        dtype=tf.uint8)
	    feature = tf.reshape(feature, (height, width, feature_cfg.num_channels))
	    # Normalizes the feature with mean and std values, which are divided by
	    # 255 because an uint8 image is re-scaled automatically. Images other
	    # than uint8 type will be wrongly normalized.
	    return preprocess_ops.normalize_image(
	        feature, [mean / 255.0 for mean in feature_cfg.mean],
	        [stddev / 255.0 for stddev in feature_cfg.stddev])

	  def _prepare_image_and_label(self, data):
	    """Prepare normalized image and label.

	    Args:
	      data: dict of parsed example fields. Reads `image/height`,
	        `image/width`, `image/segmentation/class/encoded`, the configured
	        image feature key and the keys of any additional dense features.

	    Returns:
	      A tuple `(concat_input, label)`. `concat_input` is the normalized image,
	      concatenated along the channel axis with the additional dense features
	      when any are configured. `label` is a float32 tensor with a leading
	      dimension of 1. When `preserve_aspect_ratio` is False both are resized
	      to `output_size`.
	    """
	    height = data['image/height']
	    width = data['image/width']

	    label = tf.io.decode_image(
	        data['image/segmentation/class/encoded'], channels=1)
	    label = tf.reshape(label, (1, height, width))
	    label = tf.cast(label, tf.float32)

	    image = self._decode_dense_feature(
	        data, self._image_feature, height, width)

	    if self._additional_dense_features:
	      input_list = [image]
	      for feature_cfg in self._additional_dense_features:
	        input_list.append(
	            self._decode_dense_feature(data, feature_cfg, height, width))
	      concat_input = tf.concat(input_list, axis=2)
	    else:
	      concat_input = image

	    if not self._preserve_aspect_ratio:
	      out_height, out_width = self._output_size[0], self._output_size[1]
	      # tf.image.resize needs a channel-last layout, so move the label to
	      # (height, width, 1), resize, then restore the leading dimension.
	      label = tf.reshape(label, [height, width, 1])
	      concat_input = tf.image.resize(
	          concat_input, self._output_size, method='bilinear')
	      label = tf.image.resize(label, self._output_size, method='nearest')
	      # Indexing the size instead of `[1] + size` also accepts tuples.
	      label = tf.reshape(label[:, :, -1], [1, out_height, out_width])

	    return concat_input, label

	  def _parse_train_data(self, data):
	    """Parses data for training.

	    Args:
	      data: dict of parsed example fields, see `_prepare_image_and_label`.

	    Returns:
	      A tuple `(image, labels)`. `image` is cast to the configured dtype and
	      `labels` is a dict with keys `masks`, `valid_masks` and `image_info`.
	    """
	    image, label = self._prepare_image_and_label(data)

	    # Normalize the label into the range of 0 and 1 for matting ground-truth.
	    # Note that the input ground-truth labels must be 0 to 255, and do not
	    # contain ignore_label. For gt_is_matting_map case, ignore_label is only
	    # used for padding the labels.
	    if self._gt_is_matting_map:
	      label = tf.cast(label, tf.float32) / 255.0

	    if self._crop_size:
	      crop_height, crop_width = self._crop_size[0], self._crop_size[1]

	      # Move the label from (1, H, W) to (H, W, 1). Using the label's own
	      # shape (rather than the original image height/width) keeps this valid
	      # when the label was already resized to output_size because
	      # preserve_aspect_ratio is False.
	      label = tf.transpose(label, [1, 2, 0])
	      # If output_size is specified, resize image, and label to desired
	      # output_size.
	      if self._output_size:
	        image = tf.image.resize(image, self._output_size, method='bilinear')
	        label = tf.image.resize(label, self._output_size, method='nearest')

	      # Crop image and label together so both see the same random window.
	      image_mask = tf.concat([image, label], axis=2)
	      image_mask_crop = tf.image.random_crop(
	          image_mask, [crop_height, crop_width, tf.shape(image_mask)[-1]])
	      image = image_mask_crop[:, :, :-1]
	      label = tf.reshape(image_mask_crop[:, :, -1],
	                         [1, crop_height, crop_width])

	    # Flips image randomly during training.
	    if self._aug_rand_hflip:
	      image, _, label = preprocess_ops.random_horizontal_flip(
	          image, masks=label)

	    train_image_size = self._crop_size if self._crop_size else self._output_size
	    # Resizes and crops image.
	    image, image_info = preprocess_ops.resize_and_crop_image(
	        image,
	        train_image_size,
	        train_image_size,
	        aug_scale_min=self._aug_scale_min,
	        aug_scale_max=self._aug_scale_max)

	    # Scale and offset that were applied to the image; the same values are
	    # used below to resize and crop the label.
	    image_scale = image_info[2, :]
	    offset = image_info[3, :]

	    # Pad label and make sure the padded region assigned to the ignore label.
	    # The label is first offset by +1 and then padded with 0.
	    label += 1
	    label = tf.expand_dims(label, axis=3)
	    label = preprocess_ops.resize_and_crop_masks(label, image_scale,
	                                                 train_image_size, offset)
	    # Undo the +1 offset; padded pixels become -1 and are mapped to the
	    # ignore label.
	    label -= 1
	    label = tf.where(
	        tf.equal(label, -1), self._ignore_label * tf.ones_like(label), label)
	    label = tf.squeeze(label, axis=0)
	    valid_mask = tf.not_equal(label, self._ignore_label)

	    labels = {
	        'masks': label,
	        'valid_masks': valid_mask,
	        'image_info': image_info,
	    }

	    # Cast image as self._dtype
	    image = tf.cast(image, dtype=self._dtype)

	    return image, labels

	  def _parse_eval_data(self, data):
	    """Parses data for evaluation.

	    Args:
	      data: dict of parsed example fields, see `_prepare_image_and_label`.

	    Returns:
	      A tuple `(image, labels)`. `image` is cast to the configured dtype and
	      `labels` is a dict with keys `masks`, `valid_masks` and `image_info`.
	    """
	    image, label = self._prepare_image_and_label(data)

	    # Binarize mask if ground-truth is a matting map
	    if self._gt_is_matting_map:
	      label = tf.divide(tf.cast(label, dtype=tf.float32), 255.0)
	      label = utils.binarize_matting_map(label)

	    # The label is first offset by +1 and then padded with 0.
	    label += 1
	    label = tf.expand_dims(label, axis=3)

	    # Resizes and crops image.
	    image, image_info = preprocess_ops.resize_and_crop_image(
	        image, self._output_size, self._output_size)

	    if self._resize_eval_groundtruth:
	      # Resizes eval masks to match input image sizes. In that case, mean IoU
	      # is computed on output_size not the original size of the images.
	      image_scale = image_info[2, :]
	      offset = image_info[3, :]
	      label = preprocess_ops.resize_and_crop_masks(label, image_scale,
	                                                   self._output_size, offset)
	    else:
	      # Keep the original resolution and only zero-pad to a fixed size.
	      label = tf.image.pad_to_bounding_box(label, 0, 0,
	                                           self._groundtruth_padded_size[0],
	                                           self._groundtruth_padded_size[1])

	    # Undo the +1 offset; padded pixels become -1 and are mapped to the
	    # ignore label.
	    label -= 1
	    label = tf.where(
	        tf.equal(label, -1), self._ignore_label * tf.ones_like(label), label)
	    label = tf.squeeze(label, axis=0)

	    valid_mask = tf.not_equal(label, self._ignore_label)
	    labels = {
	        'masks': label,
	        'valid_masks': valid_mask,
	        'image_info': image_info
	    }

	    # Cast image as self._dtype
	    image = tf.cast(image, dtype=self._dtype)

	    return image, labels