ASL-MoViNet-T5-translator

Build error

App Files Files Community

ASL-MoViNet-T5-translator / official /modeling /grad_utils.py

deanna-emery

updates

93528c6 over 2 years ago

raw

history blame contribute delete

6.74 kB

	# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Some gradient util functions to help users writing custom training loop."""

	from absl import logging

	import tensorflow as tf, tf_keras


	def _filter_grads(grads_and_vars):
	"""Filter out iterable with grad equal to None."""
	grads_and_vars = tuple(grads_and_vars)
	if not grads_and_vars:
	return grads_and_vars
	filtered = []
	vars_with_empty_grads = []
	for grad, var in grads_and_vars:
	if grad is None:
	vars_with_empty_grads.append(var)
	else:
	filtered.append((grad, var))
	filtered = tuple(filtered)
	if not filtered:
	raise ValueError("No gradients provided for any variable: %s." %
	([v.name for _, v in grads_and_vars],))
	if vars_with_empty_grads:
	logging.warning(
	("Gradients do not exist for variables %s when minimizing the loss."),
	([v.name for v in vars_with_empty_grads]))
	return filtered


	def _filter_and_allreduce_gradients(grads_and_vars,
	                                    allreduce_precision="float32",
	                                    bytes_per_pack=0):
	  """Removes None gradients and sum-allreduces the remaining ones.

	  Intended for callers that want to allreduce gradients explicitly and
	  customize gradient operations before and after the allreduce. The result
	  is meant to be handed to
	  optimizer.apply_gradients(experimental_aggregate_gradients=False).

	  Args:
	    grads_and_vars: gradients and variables pairs.
	    allreduce_precision: "float16" casts the gradients to float16 for the
	      allreduce and back to float32 afterwards. Any other value leaves the
	      gradients uncast.
	    bytes_per_pack: A non-negative integer. Breaks collective operations into
	      packs of certain size. If it's zero, all gradients are in one pack.

	  Returns:
	    A pair `(allreduced_grads, variables)`: the allreduced non-None gradients
	    and the tuple of variables they belong to, in matching order.
	  """
	  grads, variables = zip(*_filter_grads(grads_and_vars))

	  use_fp16 = allreduce_precision == "float16"
	  if use_fp16:
	    grads = [tf.cast(g, "float16") for g in grads]

	  options = tf.distribute.experimental.CommunicationOptions(
	      bytes_per_pack=bytes_per_pack)
	  strategy = tf.distribute.get_strategy()
	  reduced = strategy.extended._replica_ctx_all_reduce(  # pylint: disable=protected-access
	      tf.distribute.ReduceOp.SUM, grads, options)

	  if use_fp16:
	    reduced = [tf.cast(g, "float32") for g in reduced]
	  return reduced, variables


	def _run_callbacks(callbacks, grads_and_vars):
	for callback in callbacks:
	grads_and_vars = callback(grads_and_vars)
	return grads_and_vars


	def minimize_using_explicit_allreduce(tape,
	                                      optimizer,
	                                      loss,
	                                      trainable_variables,
	                                      pre_allreduce_callbacks=None,
	                                      post_allreduce_callbacks=None,
	                                      allreduce_bytes_per_pack=0):
	  """Minimizes loss for one step by updating `trainable_variables`.

	  This explicitly performs gradient allreduce, instead of relying on implicit
	  allreduce in optimizer.apply_gradients(). If training using FP16 mixed
	  precision (i.e. `optimizer` is a `LossScaleOptimizer`), explicit allreduce
	  will aggregate gradients in FP16 format. Otherwise (TPU, or GPU training
	  using FP32) gradients are aggregated in FP32 format.

	  Args:
	    tape: An instance of `tf.GradientTape`.
	    optimizer: An instance of `tf_keras.optimizers.Optimizer`.
	    loss: the loss tensor.
	    trainable_variables: A list of model Variables.
	    pre_allreduce_callbacks: A list of callback functions that take gradients
	      and model variables pairs as input, manipulate them, and return new
	      gradients and model variables pairs. The callbacks are invoked in
	      list order, before gradients are allreduced. With mixed precision
	      training, the pre_allreduce_callbacks are applied to the scaled
	      gradients. Default is no callbacks.
	    post_allreduce_callbacks: A list of callback functions that take
	      gradients and model variables pairs as input, manipulate them, and
	      return new gradients and model variables pairs. The callbacks are
	      invoked in list order, right before gradients are applied to
	      variables for updates. Default is no callbacks.
	    allreduce_bytes_per_pack: A non-negative integer. Breaks collective
	      operations into packs of certain size. If it's zero, all gradients are
	      in one pack.
	  """
	  uses_loss_scaling = isinstance(
	      optimizer, tf_keras.mixed_precision.LossScaleOptimizer)

	  if uses_loss_scaling:
	    # FP16 GPU code path: differentiate the scaled loss, reduce in float16.
	    with tape:
	      target_loss = optimizer.get_scaled_loss(loss)
	    precision = "float16"
	  else:
	    # TPU or FP32 GPU code path.
	    target_loss = loss
	    precision = "float32"

	  raw_grads = tape.gradient(target_loss, trainable_variables)
	  grads_and_vars = zip(raw_grads, trainable_variables)
	  if pre_allreduce_callbacks:
	    grads_and_vars = _run_callbacks(pre_allreduce_callbacks, grads_and_vars)

	  reduced_grads, kept_vars = _filter_and_allreduce_gradients(
	      grads_and_vars,
	      allreduce_precision=precision,
	      bytes_per_pack=allreduce_bytes_per_pack)
	  if uses_loss_scaling:
	    # Undo the loss scaling only after the allreduce has completed.
	    reduced_grads = optimizer.get_unscaled_gradients(reduced_grads)

	  grads_and_vars = zip(reduced_grads, kept_vars)
	  if post_allreduce_callbacks:
	    grads_and_vars = _run_callbacks(post_allreduce_callbacks, grads_and_vars)
	  # Gradients are already aggregated above, so the optimizer must not
	  # aggregate them again.
	  optimizer.apply_gradients(
	      grads_and_vars, experimental_aggregate_gradients=False)