# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Learning rate schedule classes."""

import math
from typing import Mapping, Any, Union, Optional

import tensorflow as tf, tf_keras


def _make_offset_wrapper(new_class_name: str, base_lr_class):
  """Generates an offset wrapper of a learning rate schedule.

  It returns a subclass of `base_lr_class` that takes an `offset` argument in
  the constructor. When the new class instance is called, the behavior is:

    new_class_object(step) = base_lr_class_object(step - offset)

  Example:

    CosineDecayWithOffset = _make_offset_wrapper(
        'CosineDecayWithOffset', tf_keras.optimizers.schedules.CosineDecay)
    # Use the lr:
    lr = CosineDecayWithOffset(
        offset=100, initial_learning_rate=0.1, decay_steps=1000)
    lr(101)  # equals tf_keras.optimizers.schedules.CosineDecay(...)(101 - 100)

  Args:
    new_class_name: the name of the new class.
    base_lr_class: the base learning rate schedule class. Should be a subclass
      of tf_keras.optimizers.schedules.LearningRateSchedule.

  Returns:
    A new class (subclass of `base_lr_class`) that can take an offset.
  """
  assert issubclass(base_lr_class,
                    tf_keras.optimizers.schedules.LearningRateSchedule), (
                        "base_lr_class should be a subclass of keras "
                        f"LearningRateSchedule, got {base_lr_class}")

  # pylint: disable=protected-access,pointless-statement
  def offset_learning_rate_init(self, offset=0, **kwargs):
    """Construct learning rate schedule object.

    When this object is called, its behavior is

      self.__call__(step) == base_lr_class.__call__(step - offset)

    Args:
      self: this object.
      offset: The offset when computing the learning rate schedule.
      **kwargs: Pass through to base learning rate class constructor.
    """
    base_lr_class.__init__(self, **kwargs)
    self._offset = offset

  def offset_learning_rate_call(self, step):
    step = tf.cast(step - self._offset, tf.float32)
    return base_lr_class.__call__(self, step)

  # pylint: enable=protected-access,pointless-statement

  return type(
      new_class_name, (base_lr_class,), {
          "base_lr_class": base_lr_class,
          "__init__": offset_learning_rate_init,
          "__call__": offset_learning_rate_call
      })


PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
    "PiecewiseConstantDecayWithOffset",
    tf_keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
    "PolynomialDecayWithOffset", tf_keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
    "ExponentialDecayWithOffset",
    tf_keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper(
    "CosineDecayWithOffset",
    tf_keras.optimizers.schedules.CosineDecay,
)
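
# A minimal usage sketch of the generated offset wrappers (the boundary,
# value, and step numbers below are illustrative, not from the library docs):
#
#   lr = PiecewiseConstantDecayWithOffset(
#       offset=100, boundaries=[1000, 2000], values=[1.0, 0.5, 0.1])
#   lr(100)   # == 1.0: the wrapped schedule sees step 100 - 100 = 0.
#   lr(1600)  # == 0.5: the wrapped schedule sees step 1500.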


class LinearWarmup(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Linear warmup schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf_keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               warmup_learning_rate: float,
               name: Optional[str] = None):
    """Adds linear warmup to a learning rate schedule.

    `warmup_learning_rate` is the initial learning rate; the final learning
    rate of the warmup period is the learning rate that
    `after_warmup_lr_sched` yields at step `warmup_steps`. The learning rate
    increases linearly at each step according to the following formula:

      learning_rate = warmup_lr + step / warmup_steps
                      * (final_warmup_lr - warmup_lr)

    Using warmup overrides the learning rate schedule for the first
    `warmup_steps` steps.

    Args:
      after_warmup_lr_sched: tf_keras.optimizers.schedules.LearningRateSchedule
        or a constant.
      warmup_steps: Number of warmup steps.
      warmup_learning_rate: Initial learning rate for the warmup.
      name: Optional, name of warmup schedule.
    """
    super().__init__()
    self._name = name
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._warmup_steps = warmup_steps
    self._init_warmup_lr = warmup_learning_rate
    if isinstance(after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      self._final_warmup_lr = after_warmup_lr_sched(warmup_steps)
    else:
      self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32)

  def __call__(self, step: int):
    global_step = tf.cast(step, dtype=tf.float32)

    linear_warmup_lr = (
        self._init_warmup_lr + global_step / self._warmup_steps *
        (self._final_warmup_lr - self._init_warmup_lr))

    if isinstance(self._after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      after_warmup_lr = self._after_warmup_lr_sched(step)
    else:
      after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

    lr = tf.cond(global_step < self._warmup_steps,
                 lambda: linear_warmup_lr,
                 lambda: after_warmup_lr)
    return lr

  def get_config(self) -> Mapping[str, Any]:
    if isinstance(self._after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
    else:
      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "warmup_learning_rate": self._init_warmup_lr,
        "name": self._name
    })
    return config
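
# A minimal usage sketch of `LinearWarmup` (the schedule and step numbers are
# illustrative, not from the library docs):
#
#   base = tf_keras.optimizers.schedules.PolynomialDecay(
#       initial_learning_rate=0.1, decay_steps=10000, end_learning_rate=0.0)
#   lr = LinearWarmup(after_warmup_lr_sched=base, warmup_steps=1000,
#                     warmup_learning_rate=0.0)
#   lr(0)     # == 0.0: warmup starts at warmup_learning_rate.
#   lr(500)   # == 0.045: halfway between 0.0 and base(1000) == 0.09.
#   lr(2000)  # == base(2000): past warmup, the base schedule is used.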


class PolynomialWarmUp(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Applies polynomial warmup schedule on a given learning rate decay schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf_keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               power: float = 1.0,
               name: str = "PolynomialWarmup"):
    super().__init__()
    if isinstance(after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      self._initial_learning_rate = after_warmup_lr_sched(warmup_steps)
    else:
      self._initial_learning_rate = tf.cast(
          after_warmup_lr_sched, dtype=tf.float32)

    self._warmup_steps = warmup_steps
    self._power = power
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PolynomialWarmUp") as name:
      # Implements polynomial warmup: if global_step < warmup_steps, the
      # learning rate equals
      # `(global_step / num_warmup_steps)**power * init_lr`.
      global_step_float = tf.cast(step, tf.float32)
      warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)

      if self._warmup_steps <= 0:
        warmup_percent_done = 1.0
      else:
        # A zero `step` may cause Inf. So make `step` positive.
        step_non_zero = tf.math.maximum(global_step_float, 1.0)
        warmup_percent_done = step_non_zero / warmup_steps_float

      warmup_learning_rate = (
          self._initial_learning_rate *
          tf.math.pow(warmup_percent_done, self._power))

      if isinstance(self._after_warmup_lr_sched,
                    tf_keras.optimizers.schedules.LearningRateSchedule):
        after_warmup_lr = self._after_warmup_lr_sched(step)
      else:
        after_warmup_lr = tf.cast(
            self._after_warmup_lr_sched, dtype=tf.float32)

      return tf.cond(
          global_step_float < warmup_steps_float,
          lambda: warmup_learning_rate,
          lambda: after_warmup_lr,
          name=name)

  def get_config(self) -> Mapping[str, Any]:
    if isinstance(self._after_warmup_lr_sched,
                  tf_keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
    else:
      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "power": self._power,
        "name": self._name
    })
    return config
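
# A minimal usage sketch of `PolynomialWarmUp` with a constant after-warmup
# rate (illustrative numbers, not from the library docs):
#
#   lr = PolynomialWarmUp(after_warmup_lr_sched=0.1, warmup_steps=100,
#                         power=2.0)
#   lr(50)   # == 0.1 * (50 / 100)**2 == 0.025
#   lr(150)  # == 0.1: past warmup, the constant rate applies.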


class DirectPowerDecay(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule that follows lr * (step)^power."""

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               name: str = "DirectPowerDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "DirectPowerDecay"):
      step = tf.cast(step, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      return learning_rate

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "name": self._name,
    }
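
# A minimal usage sketch of `DirectPowerDecay` (illustrative numbers). A
# negative power gives an inverse-power decay, e.g. an inverse square root:
#
#   lr = DirectPowerDecay(initial_learning_rate=0.01, power=-0.5)
#   lr(0)      # == 0.01: step is clamped to 1, so 0.01 * 1.0**-0.5.
#   lr(10000)  # == 0.01 * 10000**-0.5 == 1e-4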


class PowerAndLinearDecay(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Power decay schedule multiplied by a linear decay at the end.

  The schedule has the following behavior. Let

    offset_step = step - offset.

  1) If offset_step < 0, the actual learning rate equals
     initial_learning_rate.
  2) If 0 <= offset_step <= total_decay_steps * (1 - linear_decay_fraction),
     the actual learning rate equals lr * offset_step^power.
  3) If total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps, the actual learning rate equals lr * offset_step^power
     * (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) If offset_step >= total_decay_steps, the actual learning rate equals
     zero.
  """

  def __init__(self,
               initial_learning_rate: float,
               total_decay_steps: int,
               power: float = 1.0,
               linear_decay_fraction: float = 0.1,
               offset: int = 0,
               name: str = "PowerAndLinearDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      total_decay_steps: The total number of steps for power + linear decay.
      power: The order of the polynomial.
      linear_decay_fraction: Over the last `linear_decay_fraction` fraction of
        `total_decay_steps`, the learning rate is additionally multiplied by a
        linear decay.
      offset: The offset applied to steps.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._total_decay_steps = total_decay_steps
    self._power = power
    self._linear_decay_fraction = linear_decay_fraction
    self._offset = offset
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerAndLinearDecay"):
      step = tf.cast(step - self._offset, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      if self._total_decay_steps * self._linear_decay_fraction > 0:
        learning_rate *= tf.minimum(
            1.0, (self._total_decay_steps - step) /
            (self._total_decay_steps * self._linear_decay_fraction))
        learning_rate = tf.maximum(0.0, learning_rate)
      return learning_rate

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "total_decay_steps": self._total_decay_steps,
        "power": self._power,
        "linear_decay_fraction": self._linear_decay_fraction,
        "offset": self._offset,
        "name": self._name,
    }
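
# A minimal usage sketch of `PowerAndLinearDecay` (illustrative numbers),
# exercising the four regions described in the class docstring. power=0.0
# keeps the power term flat so only the linear tail is visible:
#
#   lr = PowerAndLinearDecay(initial_learning_rate=1.0, total_decay_steps=100,
#                            power=0.0, linear_decay_fraction=0.1, offset=10)
#   lr(5)    # == 1.0: offset_step < 0.
#   lr(60)   # == 1.0: power region.
#   lr(105)  # == 0.5: halfway through the final 10-step linear ramp.
#   lr(120)  # == 0.0: past total_decay_steps.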


class PowerDecayWithOffset(tf_keras.optimizers.schedules.LearningRateSchedule):
  """Power learning rate decay with offset.

  Learning rate equals `pre_offset_learning_rate` if `step` < `offset`.
  Otherwise, learning rate equals lr * (step - offset)^power.
  """

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               offset: int = 0,
               pre_offset_learning_rate: float = 1.0e6,
               name: str = "PowerDecayWithOffset"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      offset: The offset when computing the power decay.
      pre_offset_learning_rate: The maximum learning rate we'll use.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._offset = offset
    self._pre_offset_lr = pre_offset_learning_rate
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerDecayWithOffset"):
      step = tf.cast(step, tf.float32)
      lr_after_offset = tf.math.pow(
          tf.math.maximum(step - self._offset, 1.0), self._power) * (
              self._initial_learning_rate)

      sign = tf.cast(step > self._offset, tf.float32)
      lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset
      # Power may give infinitely large LR. So cap it with pre_offset_lr.
      return tf.math.minimum(lr_combined, self._pre_offset_lr)

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "offset": self._offset,
        "pre_offset_learning_rate": self._pre_offset_lr,
        "name": self._name,
    }
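
# A minimal usage sketch of `PowerDecayWithOffset` (illustrative numbers).
# With a negative power this is an inverse-power decay that starts at
# `offset` and is capped at `pre_offset_learning_rate` before that point:
#
#   lr = PowerDecayWithOffset(initial_learning_rate=1.0, power=-0.5,
#                             offset=100, pre_offset_learning_rate=0.01)
#   lr(50)     # == 0.01: before the offset, pre_offset_learning_rate is used.
#   lr(10100)  # == 1.0 * (10100 - 100)**-0.5 == 0.01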


class StepCosineDecayWithOffset(
    tf_keras.optimizers.schedules.LearningRateSchedule):
  """Stepwise cosine learning rate decay with offset.

  The learning rate is equivalent to one or more cosine decay(s) starting and
  ending at each interval.

  Example:

  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepCosineDecayWithOffset(
          boundaries,
          values))
  ```

  From step 0 to 100000, the learning rate cosine-decays from 1.0 to 0.5.
  From step 100000 to 110000, it cosine-decays from 0.5 to 0.0.
  """

  def __init__(self,
               boundaries,
               values,
               offset: int = 0,
               name: str = "StepCosineDecayWithOffset"):
    """Initialize configuration of the learning rate schedule.

    Args:
      boundaries: A list of `Tensor`s or `int`s with strictly increasing
        entries, and with all elements having the same type as the optimizer
        step.
      values: A list of `Tensor`s or `float`s that specifies the values for
        the intervals defined by `boundaries`. It should have the same number
        of elements as `boundaries`, and all elements should have the same
        type.
      offset: The offset applied to steps.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self.values = values
    self.boundaries = boundaries
    self.offset = offset
    self.name = name

    if len(self.values) < 1:
      raise ValueError(f"Expected non-empty values, got {self.values}")

    if len(self.boundaries) != len(self.values):
      raise ValueError(
          "Boundaries length must equal learning rate levels length: "
          f"{len(self.boundaries)} != {len(self.values)}")

    self.total_steps = (
        [boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)
        ] + [0])

  def __call__(self, global_step):
    with tf.name_scope(self.name or "StepCosineDecayWithOffset"):
      global_step = tf.cast(global_step - self.offset, tf.float32)
      lr_levels = self.values
      lr_steps = self.boundaries
      level_total_steps = self.total_steps
      num_levels = len(lr_levels)

      init_lr = lr_levels[0]
      next_init_lr = lr_levels[1] if num_levels > 1 else 0.
      init_total_steps = level_total_steps[0]

      cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
          tf.constant(math.pi) * (global_step) /
          (init_total_steps)) + 1.0) / 2.0 + next_init_lr)
      learning_rate = cosine_learning_rate

      for i in range(1, num_levels):
        next_init_lr = lr_levels[i]
        next_start_step = lr_steps[i]
        next_total_steps = level_total_steps[i]
        next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.

        next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
                                     (tf.cos(
                                         tf.constant(math.pi) *
                                         (global_step - next_start_step) /
                                         (next_total_steps)) + 1.0) / 2.0 +
                                     next_next_init_lr)
        learning_rate = tf.where(global_step >= next_start_step,
                                 next_cosine_learning_rate, learning_rate)

      return learning_rate

  def get_config(self):
    return {
        "boundaries": self.boundaries,
        "values": self.values,
        "offset": self.offset,
        "name": self.name
    }
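
# A minimal construction sketch of `StepCosineDecayWithOffset` (illustrative
# numbers; see the class docstring for the intended decay behavior):
#
#   lr = StepCosineDecayWithOffset(
#       boundaries=[100000, 110000], values=[1.0, 0.5], offset=0)
#   lr(0)  # == values[0] == 1.0 at the first step.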