# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| """Parser for video and label datasets.""" | |
| from typing import Dict, Optional, Tuple, Union | |
| from absl import logging | |
| import tensorflow as tf, tf_keras | |
| from official.vision.configs import video_classification as exp_cfg | |
| from official.vision.dataloaders import decoder | |
| from official.vision.dataloaders import parser | |
| from official.vision.ops import augment | |
| from official.vision.ops import preprocess_ops_3d | |
| IMAGE_KEY = 'image/encoded' | |
| LABEL_KEY = 'clip/label/index' | |
def process_image(image: tf.Tensor,
                  is_training: bool = True,
                  num_frames: int = 32,
                  stride: int = 1,
                  random_stride_range: int = 0,
                  num_test_clips: int = 1,
                  min_resize: int = 256,
                  crop_size: Union[int, Tuple[int, int]] = 224,
                  num_channels: int = 3,
                  num_crops: int = 1,
                  zero_centering_image: bool = False,
                  min_aspect_ratio: float = 0.5,
                  max_aspect_ratio: float = 2,
                  min_area_ratio: float = 0.49,
                  max_area_ratio: float = 1.0,
                  augmenter: Optional[augment.ImageAugment] = None,
                  seed: Optional[int] = None,
                  input_image_format: Optional[str] = 'jpeg') -> tf.Tensor:
| """Processes a serialized image tensor. | |
| Args: | |
| image: Input Tensor of shape [time-steps] and type tf.string of serialized | |
| frames. | |
| is_training: Whether or not in training mode. If True, random sample, crop | |
| and left right flip is used. | |
| num_frames: Number of frames per sub clip. | |
| stride: Temporal stride to sample frames. | |
| random_stride_range: An int indicating the min and max bounds to uniformly | |
| sample different strides from the video. E.g., a value of 1 with stride=2 | |
| will uniformly sample a stride in {1, 2, 3} for each video in a batch. | |
| Only used enabled training for the purposes of frame-rate augmentation. | |
| Defaults to 0, which disables random sampling. | |
| num_test_clips: Number of test clips (1 by default). If more than 1, this | |
| will sample multiple linearly spaced clips within each video at test time. | |
| If 1, then a single clip in the middle of the video is sampled. The clips | |
| are aggregated in the batch dimension. | |
| min_resize: Frames are resized so that min(height, width) is min_resize. | |
| crop_size: Final size of the frame after cropping the resized frames. | |
| Optionally, specify a tuple of (crop_height, crop_width) if | |
| crop_height != crop_width. | |
| num_channels: Number of channels of the clip. | |
| num_crops: Number of crops to perform on the resized frames. | |
| zero_centering_image: If True, frames are normalized to values in [-1, 1]. | |
| If False, values in [0, 1]. | |
| min_aspect_ratio: The minimum aspect range for cropping. | |
| max_aspect_ratio: The maximum aspect range for cropping. | |
| min_area_ratio: The minimum area range for cropping. | |
| max_area_ratio: The maximum area range for cropping. | |
| augmenter: Image augmenter to distort each image. | |
| seed: A deterministic seed to use when sampling. | |
| input_image_format: The format of input image which could be jpeg, png or | |
| none for unknown or mixed datasets. | |
| Returns: | |
| Processed frames. Tensor of shape | |
| [num_frames * num_test_clips, crop_height, crop_width, num_channels]. | |
| """ | |
  # Validate parameters.
  if is_training and num_test_clips != 1:
    logging.warning(
        '`num_test_clips` %d is ignored since `is_training` is `True`.',
        num_test_clips)
  if random_stride_range < 0:
    raise ValueError('Random stride range should be >= 0, got {}'.format(
        random_stride_range))
  if input_image_format not in ('jpeg', 'png', 'none'):
    raise ValueError('Unknown input image format: {}'.format(
        input_image_format))
  if isinstance(crop_size, int):
    crop_size = (crop_size, crop_size)
  crop_height, crop_width = crop_size

  # Temporal sampler.
  if is_training:
    if random_stride_range > 0:
      # Uniformly sample different frame-rates. Note that `maxval` of
      # tf.random.uniform is exclusive, so add 1 to include the upper bound.
      stride = tf.random.uniform(
          [],
          tf.maximum(stride - random_stride_range, 1),
          stride + random_stride_range + 1,
          dtype=tf.int32)

    # Sample random clip.
    image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride,
                                              seed)
  elif num_test_clips > 1:
    # Sample linspace clips.
    image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips,
                                                       num_frames, stride)
  else:
    # Sample middle clip.
    image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride)

  # Decode the image string (e.g., JPEG or PNG) to tf.uint8.
  if image.dtype == tf.string:
    image = preprocess_ops_3d.decode_image(image, num_channels)

  if is_training:
    # Standard image data augmentation: random resized crop and random flip.
    image = preprocess_ops_3d.random_crop_resize(
        image, crop_height, crop_width, num_frames, num_channels,
        (min_aspect_ratio, max_aspect_ratio),
        (min_area_ratio, max_area_ratio))
    image = preprocess_ops_3d.random_flip_left_right(image, seed)

    if augmenter is not None:
      image = augmenter.distort(image)
  else:
    # Resize images (resize happens only if necessary, to save compute).
    image = preprocess_ops_3d.resize_smallest(image, min_resize)
    # Crop the frames.
    image = preprocess_ops_3d.crop_image(image, crop_height, crop_width, False,
                                         num_crops)

  # Cast the frames to float32, normalizing according to zero_centering_image.
  return preprocess_ops_3d.normalize_image(image, zero_centering_image)
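

# Example (illustrative only, not part of the library): given a 1-D tf.string
# tensor `frames` holding serialized JPEG frames, an eval-time call that
# aggregates four linearly spaced clips could look like:
#
#   clips = process_image(
#       frames,
#       is_training=False,
#       num_frames=32,
#       stride=2,
#       num_test_clips=4,
#       min_resize=256,
#       crop_size=224)
#   # -> float32 tensor of shape [32 * 4, 224, 224, 3].

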
def postprocess_image(image: tf.Tensor,
                      is_training: bool = True,
                      num_frames: int = 32,
                      num_test_clips: int = 1,
                      num_test_crops: int = 1) -> tf.Tensor:
  """Processes a batched Tensor of frames.

  The same parameter values used in `process_image` should be used here.

  Args:
    image: Input Tensor of shape [batch, time-steps, height, width, 3].
    is_training: Whether or not in training mode. If True, random sampling,
      cropping and left-right flipping are used.
    num_frames: Number of frames per sub clip.
    num_test_clips: Number of test clips (1 by default). If more than 1, this
      will sample multiple linearly spaced clips within each video at test
      time. If 1, then a single clip in the middle of the video is sampled.
      The clips are aggregated in the batch dimension.
    num_test_crops: Number of test crops (1 by default). If more than 1, there
      are multiple crops for each clip at test time. If 1, there is a single
      central crop. The crops are aggregated in the batch dimension.

  Returns:
    Processed frames. Tensor of shape
      [batch * num_test_clips * num_test_crops, num_frames, height, width, 3].
  """
  num_views = num_test_clips * num_test_crops
  if num_views > 1 and not is_training:
    # In this case, the multiple views are merged into the batch dimension,
    # which becomes batch * num_views.
    image = tf.reshape(image, [-1, num_frames] + image.shape[2:].as_list())

  return image
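

# Example (illustrative only): with num_test_clips=4, num_test_crops=3 and
# num_frames=32 (i.e., 12 views per video), an eval batch of shape
# [2, 12 * 32, 224, 224, 3] is reshaped to [2 * 12, 32, 224, 224, 3], so each
# view becomes its own batch entry.

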
def process_label(label: tf.Tensor,
                  one_hot_label: bool = True,
                  num_classes: Optional[int] = None,
                  label_dtype: tf.DType = tf.int32) -> tf.Tensor:
  """Processes a label Tensor."""
  # Validate parameters.
  if one_hot_label and not num_classes:
    raise ValueError(
        '`num_classes` should be given when requesting one hot label.')

  # Cast to label_dtype (default = tf.int32).
  label = tf.cast(label, dtype=label_dtype)

  if one_hot_label:
    # Replace the label index by its one-hot representation.
    label = tf.one_hot(label, num_classes)
    if len(label.shape.as_list()) > 1:
      # Multiple label indices: merge them into a single multi-hot vector.
      label = tf.reduce_sum(label, axis=0)
    if num_classes == 1:
      # Single-class case: `tf.one_hot` maps label 0 to [1.] and label 1 to
      # [0.], so flip the value to recover the original binary label.
      label = 1 - label

  return label
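

# Example (illustrative only): process_label(tf.constant([2]), True, 5)
# yields [0., 0., 1., 0., 0.], while a multi-label input such as
# tf.constant([0, 3]) yields the multi-hot vector [1., 0., 0., 1., 0.].

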
class Decoder(decoder.Decoder):
  """A tf.SequenceExample decoder for the classification task."""

  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
    self._context_description = {
        # One or more integer label indices stored in the context.
        label_key: tf.io.VarLenFeature(tf.int64),
    }
    self._sequence_description = {
        # Each image is a JPEG-encoded string.
        image_key: tf.io.FixedLenSequenceFeature((), tf.string),
    }

  def add_feature(self, feature_name: str,
                  feature_type: Union[tf.io.VarLenFeature,
                                      tf.io.FixedLenFeature,
                                      tf.io.FixedLenSequenceFeature]):
    self._sequence_description[feature_name] = feature_type

  def add_context(self, feature_name: str,
                  feature_type: Union[tf.io.VarLenFeature,
                                      tf.io.FixedLenFeature,
                                      tf.io.FixedLenSequenceFeature]):
    self._context_description[feature_name] = feature_type

  def decode(self, serialized_example):
    """Parses a single tf.SequenceExample into a dictionary of tensors."""
    result = {}
    context, sequences = tf.io.parse_single_sequence_example(
        serialized_example, self._context_description,
        self._sequence_description)
    result.update(context)
    result.update(sequences)
    for key, value in result.items():
      if isinstance(value, tf.SparseTensor):
        result[key] = tf.sparse.to_dense(value)
    return result
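

# Example (illustrative only): registering an extra per-sequence feature
# before decoding; 'audio_key' is a hypothetical feature name.
#
#   d = Decoder()
#   d.add_feature('audio_key', tf.io.VarLenFeature(dtype=tf.float32))
#   decoded = d.decode(serialized_example)

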
class VideoTfdsDecoder(decoder.Decoder):
  """A decoder for TFDS video classification datasets."""

  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
    self._image_key = image_key
    self._label_key = label_key

  def decode(self, features):
    """Decodes a TFDS FeatureDict.

    Args:
      features: features from a TFDS video dataset.
        See https://www.tensorflow.org/datasets/catalog/ucf101 for an example.

    Returns:
      Dict of tensors.
    """
    sample_dict = {
        self._image_key: features['video'],
        self._label_key: features['label'],
    }
    return sample_dict
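

# Example (illustrative only): decoding TFDS UCF101 samples; assumes the
# `tensorflow_datasets` package is installed.
#
#   import tensorflow_datasets as tfds
#   ds = tfds.load('ucf101', split='train')
#   ds = ds.map(VideoTfdsDecoder().decode)

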
class Parser(parser.Parser):
  """Parses a video and label dataset."""

  def __init__(self,
               input_params: exp_cfg.DataConfig,
               image_key: str = IMAGE_KEY,
               label_key: str = LABEL_KEY):
    self._num_frames = input_params.feature_shape[0]
    self._stride = input_params.temporal_stride
    self._random_stride_range = input_params.random_stride_range
    self._num_test_clips = input_params.num_test_clips
    self._min_resize = input_params.min_image_size
    crop_height = input_params.feature_shape[1]
    crop_width = input_params.feature_shape[2]
    self._crop_size = crop_height if crop_height == crop_width else (
        crop_height, crop_width)
    self._num_channels = input_params.feature_shape[3]
    self._num_crops = input_params.num_test_crops
    self._zero_centering_image = input_params.zero_centering_image
    self._one_hot_label = input_params.one_hot
    self._num_classes = input_params.num_classes
    self._image_key = image_key
    self._label_key = label_key
    self._dtype = tf.dtypes.as_dtype(input_params.dtype)
    self._label_dtype = tf.dtypes.as_dtype(input_params.label_dtype)
    self._output_audio = input_params.output_audio
    self._min_aspect_ratio = input_params.aug_min_aspect_ratio
    self._max_aspect_ratio = input_params.aug_max_aspect_ratio
    self._min_area_ratio = input_params.aug_min_area_ratio
    self._max_area_ratio = input_params.aug_max_area_ratio
    self._input_image_format = input_params.input_image_format
    if self._output_audio:
      self._audio_feature = input_params.audio_feature
      self._audio_shape = input_params.audio_feature_shape

    aug_type = input_params.aug_type
    if aug_type is not None:
      if aug_type.type == 'autoaug':
        logging.info('Using AutoAugment.')
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      elif aug_type.type == 'randaug':
        logging.info('Using RandAugment.')
        self._augmenter = augment.RandAugment(
            num_layers=aug_type.randaug.num_layers,
            magnitude=aug_type.randaug.magnitude,
            cutout_const=aug_type.randaug.cutout_const,
            translate_const=aug_type.randaug.translate_const,
            prob_to_apply=aug_type.randaug.prob_to_apply,
            exclude_ops=aug_type.randaug.exclude_ops)
      else:
        raise ValueError(
            'Augmentation policy {} not supported.'.format(aug_type.type))
    else:
      self._augmenter = None

  def _parse_train_data(
      self, decoded_tensors: Dict[str, tf.Tensor]
  ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Parses data for training."""
    # Process image and label.
    image = decoded_tensors[self._image_key]
    image = process_image(
        image=image,
        is_training=True,
        num_frames=self._num_frames,
        stride=self._stride,
        random_stride_range=self._random_stride_range,
        num_test_clips=self._num_test_clips,
        min_resize=self._min_resize,
        crop_size=self._crop_size,
        num_channels=self._num_channels,
        min_aspect_ratio=self._min_aspect_ratio,
        max_aspect_ratio=self._max_aspect_ratio,
        min_area_ratio=self._min_area_ratio,
        max_area_ratio=self._max_area_ratio,
        augmenter=self._augmenter,
        zero_centering_image=self._zero_centering_image,
        input_image_format=self._input_image_format)
    image = tf.cast(image, dtype=self._dtype)
    features = {'image': image}

    label = decoded_tensors[self._label_key]
    label = process_label(label, self._one_hot_label, self._num_classes,
                          self._label_dtype)

    if self._output_audio:
      audio = decoded_tensors[self._audio_feature]
      audio = tf.cast(audio, dtype=self._dtype)
      # TODO(yeqing): synchronize audio/video sampling. Especially randomness.
      audio = preprocess_ops_3d.sample_sequence(
          audio, self._audio_shape[0], random=False, stride=1)
      audio = tf.ensure_shape(audio, self._audio_shape)
      features['audio'] = audio

    return features, label

  def _parse_eval_data(
      self, decoded_tensors: Dict[str, tf.Tensor]
  ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Parses data for evaluation."""
    image = decoded_tensors[self._image_key]
    image = process_image(
        image=image,
        is_training=False,
        num_frames=self._num_frames,
        stride=self._stride,
        num_test_clips=self._num_test_clips,
        min_resize=self._min_resize,
        crop_size=self._crop_size,
        num_channels=self._num_channels,
        num_crops=self._num_crops,
        zero_centering_image=self._zero_centering_image,
        input_image_format=self._input_image_format)
    image = tf.cast(image, dtype=self._dtype)
    features = {'image': image}

    label = decoded_tensors[self._label_key]
    label = process_label(label, self._one_hot_label, self._num_classes,
                          self._label_dtype)

    if self._output_audio:
      audio = decoded_tensors[self._audio_feature]
      audio = tf.cast(audio, dtype=self._dtype)
      audio = preprocess_ops_3d.sample_sequence(
          audio, self._audio_shape[0], random=False, stride=1)
      audio = tf.ensure_shape(audio, self._audio_shape)
      features['audio'] = audio

    return features, label


class PostBatchProcessor(object):
  """Processes a batched video and label dataset."""

  def __init__(self, input_params: exp_cfg.DataConfig):
    self._is_training = input_params.is_training
    self._num_frames = input_params.feature_shape[0]
    self._num_test_clips = input_params.num_test_clips
    self._num_test_crops = input_params.num_test_crops

  def __call__(self, features: Dict[str, tf.Tensor],
               label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
    """Postprocesses batched image features and passes the label through."""
    for key in ['image']:
      if key in features:
        features[key] = postprocess_image(
            image=features[key],
            is_training=self._is_training,
            num_frames=self._num_frames,
            num_test_clips=self._num_test_clips,
            num_test_crops=self._num_test_crops)
    return features, label
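

# Example (illustrative sketch): wiring the pieces into an input pipeline;
# `params` (an exp_cfg.DataConfig) and `raw_dataset` (a tf.data.Dataset of
# serialized tf.SequenceExamples) are assumed to exist, and `parse_fn` comes
# from the parser.Parser base class.
#
#   video_parser = Parser(params)
#   postprocess = PostBatchProcessor(params)
#   ds = (raw_dataset
#         .map(Decoder().decode)
#         .map(video_parser.parse_fn(params.is_training))
#         .batch(params.global_batch_size)
#         .map(postprocess))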