owlv2 / scenic /train_lib /train_utils.py

Upload 549 files

742a3d1 verified 2 days ago

45.6 kB

	# Copyright 2024 The Scenic Authors.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Utility functions for Training."""

	import collections.abc as collections
	import copy
	import functools
	import os
	import re
	import time
	from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Tuple, Union

	from absl import logging
	from clu import metric_writers
	import flax
	from flax import jax_utils
	from flax import struct
	import flax.linen as nn
	from flax.training import checkpoints
	import jax
	import jax.numpy as jnp
	import ml_collections
	import numpy as np
	import optax
	from scenic.common_lib import debug_utils
	from scenic.dataset_lib import dataset_utils
	from scenic.dataset_lib import datasets
	from scenic.train_lib import optimizers
	from tensorflow.io import gfile

	# JAX team is working on type annotation for pytree:
	# https://github.com/jax-ml/jax/issues/1555
	PyTree = Any
	PRNGKey = jnp.ndarray


	@struct.dataclass
	class TrainState:
	"""Dataclass to keep track of state of training.

	The state of training is structured as a struct.dataclass, which enables
	instances of this class to be passed into jax transformations like tree_map
	and pmap.
	"""

	tx: Optional[optax.GradientTransformation] = struct.field(
	default=None, pytree_node=False
	)
	opt_state: Optional[optax.OptState] = None
	params: Optional[Any] = struct.field(default_factory=dict)
	global_step: Optional[int] = 0
	model_state: Optional[Any] = struct.field(default_factory=dict)
	rng: Optional[jnp.ndarray] = None
	metadata: Optional[Dict[str, Any]] = None
	# NOTE: When using the raw TrainState as the target for checkpoint restoration
	# in Flax, you should provide the pytree structure, otherwise it might just
	# silenty ignore restoring the checkpoint subtree if you use with an empty
	# dict when setting `allow_partial_mpa_restoration=True` and if you set it
	# to None (e.g., for `metadata`` above), Flax replaces it with a state dict.

	def __getitem__(self, item):
	"""Make TrainState a subscriptable object."""
	return getattr(self, item)

	def get(self, keyname: str, default: Optional[Any] = None) -> Any:
	"""Return the value for key if it exists otherwise the default."""
	try:
	return self[keyname]
	except KeyError:
	return default


	def expand_dims_for_specs(xs, specs):
	return jax.tree.map(
	lambda s, x: jax.tree.map(
	functools.partial(jnp.expand_dims, axis=tuple(range(len(s)))),
	x,
	),
	specs,
	xs,
	)


	def squeeze_for_specs(xs, specs):
	return jax.tree.map(
	lambda s, x: jax.tree.map(
	functools.partial(jnp.squeeze, axis=tuple(range(len(s)))),
	x,
	),
	specs,
	xs,
	)


	def initialize_model(
	*,
	model_def: nn.Module,
	input_spec: Sequence[
	Union[Tuple[Tuple[int, ...], jnp.dtype], Tuple[int, ...], None]
	],
	config: ml_collections.ConfigDict,
	rngs: Union[jnp.ndarray, Mapping[str, jnp.ndarray]],
	train: Optional[bool] = False,
	**model_kwargs,
	) -> Tuple[PyTree, PyTree, int, Optional[float]]:
	"""Initializes parameters and model state.

	Args:
	model_def: Definition of a model.
	input_spec: An iterable of (shape, dtype) pairs specifying the shape and
	dtype of the inputs. If unspecified the dtype is float32.
	config: Configurations of the initialization.
	rngs: Jax rng keys.
	train: If the scenic model should be initialized in the train mode.
	**model_kwargs: Kwargs passed to flax model initialization.

	Returns:
	Initial params, Init model_state, number of trainable_params, and gflops.
	"""
	batch_size = (
	(config.batch_size // jax.device_count())
	if config.get('batch_size')
	else None
	)
	dummy_input = []
	for spec in input_spec:
	if spec is not None:
	in_st = debug_utils.input_spec_to_jax_shape_dtype_struct(
	spec, batch_size=batch_size
	)
	dummy_input.append(jnp.zeros(in_st.shape, in_st.dtype))
	else:
	dummy_input.append(None)

	# We want all parameters to be created in host RAM, not on any device, they'll
	# be sent there later as needed, otherwise we already encountered two
	# situations where we allocate them twice.
	@functools.partial(jax.jit, backend='cpu')
	def _initialize_model(rngs):
	"""Initialization function to be jitted."""
	init_model_state, init_params = flax.core.pop(
	flax.core.freeze(
	model_def.init(
	rngs, dummy_input, train=train, debug=False, *model_kwargs
	)
	),
	'params',
	)
	# Set bias in the head to low value, such that loss is small initially.
	if config.get('init_head_bias', None) is not None:
	init_params = flax.core.unfreeze(init_params)
	init_params['output_projection'] = optimizers.tree_map_with_names(
	lambda p: jnp.full_like(p, config.init_head_bias),
	init_params['output_projection'],
	match_name_fn=lambda name: 'bias' in name,
	)
	init_params = flax.core.freeze(init_params)
	return init_params, init_model_state

	if not isinstance(rngs, dict):
	rngs = {'params': rngs}
	init_params, init_model_state = _initialize_model(rngs)
	# Pop out params rng:
	rngs.pop('params')

	# Count number of trainable parameters:
	num_trainable_params = debug_utils.log_param_shapes(init_params)

	# Count gflops:
	count_flops = config.get(
	'count_flops', ml_collections.ConfigDict({'count_flops': True})
	)
	if count_flops:
	variables = {'params': init_params, **init_model_state}
	flops = debug_utils.compute_flops(
	flax_model_apply_fn=functools.partial(
	model_def.apply,
	variables,
	train=False,
	debug=False,
	rngs=rngs,
	**model_kwargs,
	),
	input_spec=count_flops.get('input_spec', input_spec),
	fuse_multiply_add=count_flops.get('fuse_multiply_add', True),
	)
	gflops = flops / (10**9)
	else:
	gflops = None

	return init_params, init_model_state, num_trainable_params, gflops


	def initialize_model_with_pytree(
	*,
	model_def: nn.Module,
	input_spec: PyTree,
	config: ml_collections.ConfigDict,
	rngs: Union[jnp.ndarray, Mapping[str, jnp.ndarray]],
	unpack_input: bool = True,
	**model_kwargs,
	) -> Tuple[PyTree, PyTree, int, Optional[float]]:
	"""Initializes parameters and model state with a pytree input_spec.

	This is an extension of the above initialize_model function where we can put
	pytree `input_spec`. We keep the original function for backward compatibility.
	If the root type of `input_spec` is `Sequence`, each element is fed to the
	model as position arguments whereas they are fed as keyword arguments if the
	root type is `dict`.

	Args:
	model_def: Definition of a model.
	input_spec: A PyTree whose leaves are (shape, dtype) pairs specifying the
	shape and dtype of the inputs. If unspecified the dtype is float32.
	config: Configurations of the initialization.
	rngs: Jax rng keys.
	unpack_input: Unpack the pytree when feeding it to the model.
	**model_kwargs: Kwargs passed to flax model initialization.

	Returns:
	Initial params, Init model_state, number of trainable_params, and gflops.
	"""
	batch_size = (
	(config.batch_size // jax.device_count())
	if config.get('batch_size')
	else None
	)

	def check_leaf_spec(spec: Sequence[PyTree]) -> bool:
	return (
	len(spec) == 2
	and isinstance(spec[0], collections.Sequence)
	and all(isinstance(i, int) for i in spec[0])
	and isinstance(spec[1], jnp.dtype)
	) or (all(isinstance(i, int) for i in spec[0]))

	def create_dummy_input(spec: PyTree) -> PyTree:
	if isinstance(spec, dict):
	return {k: create_dummy_input(v) for k, v in spec.items()}
	elif isinstance(spec, collections.Sequence):
	if check_leaf_spec(spec):
	in_st = debug_utils.input_spec_to_jax_shape_dtype_struct(
	spec, batch_size=batch_size
	)
	return jnp.zeros(in_st.shape, in_st.dtype)
	else:
	return tuple(create_dummy_input(child) for child in spec)
	elif spec is None:
	return None
	else:
	raise NotImplementedError('Unsupported spec type.', type(spec))

	dummy_input = create_dummy_input(input_spec)

	# We want all parameters to be created in host RAM, not on any device, they'll
	# be sent there later as needed, otherwise we already encountered two
	# situations where we allocate them twice.
	@functools.partial(jax.jit, backend='cpu')
	def _initialize_model(rngs):
	"""Initialization function to be jitted."""
	# If dummy_input is a dict, we feed inputs as keyword arguments, otherwise
	# feed as position arguments.
	if isinstance(dummy_input, dict) and unpack_input:
	init_model_state, init_params = flax.core.pop(
	flax.core.freeze(
	model_def.init(
	rngs, dummy_input, train=False, debug=False, model_kwargs
	)
	),
	'params',
	)
	elif isinstance(dummy_input, collections.Sequence) and unpack_input:
	init_model_state, init_params = flax.core.pop(
	flax.core.freeze(
	model_def.init(
	rngs, dummy_input, train=False, debug=False, *model_kwargs
	)
	),
	'params',
	)
	else:
	init_model_state, init_params = flax.core.pop(
	flax.core.freeze(
	model_def.init(
	rngs, dummy_input, train=False, debug=False, **model_kwargs
	)
	),
	'params',
	)
	# Set bias in the head to low value, such that loss is small initially.
	if config.get('init_head_bias', None) is not None:
	init_params = flax.core.unfreeze(init_params)
	init_params['output_projection'] = optimizers.tree_map_with_names(
	lambda p: jnp.full_like(p, config.init_head_bias),
	init_params['output_projection'],
	match_name_fn=lambda name: 'bias' in name,
	)
	init_params = flax.core.freeze(init_params)
	return init_params, init_model_state

	if not isinstance(rngs, dict):
	rngs = {'params': rngs}
	init_params, init_model_state = _initialize_model(rngs)
	# Pop out params rng:
	rngs.pop('params')

	# Count number of trainable parameters:
	num_trainable_params = debug_utils.log_param_shapes(init_params)

	# Count gflops:
	count_flops = config.get(
	'count_flops', ml_collections.ConfigDict({'count_flops': True})
	)
	if count_flops:
	variables = {'params': init_params, **init_model_state}
	flops = debug_utils.compute_flops_with_pytree(
	flax_model_apply_fn=functools.partial(
	model_def.apply,
	variables,
	train=False,
	debug=False,
	rngs=rngs,
	**model_kwargs,
	),
	input_spec=count_flops.get('input_spec', input_spec),
	unpack_input=unpack_input,
	fuse_multiply_add=count_flops.get('fuse_multiply_add', True),
	)
	gflops = flops / (10**9)
	else:
	gflops = None

	return init_params, init_model_state, num_trainable_params, gflops


	def get_dataset(
	config: ml_collections.ConfigDict,
	data_rng: PRNGKey,
	*,
	num_local_shards: Optional[int] = None,
	dataset_service_address: Optional[str] = None,
	dataset_name: Optional[str] = None,
	dataset_configs: Optional[ml_collections.ConfigDict] = None,
	**kwargs: Any,
	) -> dataset_utils.Dataset:
	"""Creates dataset.

	By default, the values in the config file are used.
	However, if the optional `dataset_name` and `dataset_configs` are passed,
	those are used instead.

	Args:
	config: The configuration of the experiment.
	data_rng: Random number generator key to use for the dataset.
	num_local_shards: Number of shards for each batch. So (bs, ...) becomes
	(num_local_shards, bs//num_local_shards, ...). If not specified, it will
	be number of local devices.
	dataset_service_address: Used when using the tf.data.experimental.service
	dataset_name: Name of dataset to load, if not reading from the config.
	dataset_configs: Configuration of the dataset, if not reading directly from
	the config.
	**kwargs: Keyword arguments passed to the dataset builders.

	Returns:
	A dataset_utils.Dataset object.
	"""
	device_count = jax.device_count()
	logging.info('device_count: %d', device_count)
	logging.info('num_hosts : %d', jax.process_count())
	logging.info('host_id : %d', jax.process_index())

	dataset_name = dataset_name or config.dataset_name
	dataset_builder = datasets.get_dataset(dataset_name)

	batch_size = config.batch_size
	if batch_size % device_count > 0:
	raise ValueError(
	f'Batch size ({batch_size}) must be divisible by the '
	f'number of devices ({device_count})'
	)

	eval_batch_size = config.get('eval_batch_size', batch_size)
	if eval_batch_size % device_count > 0:
	raise ValueError(
	f'Eval batch size ({eval_batch_size}) must be divisible '
	f'by the number of devices ({device_count})'
	)

	local_batch_size = batch_size // jax.process_count()
	eval_local_batch_size = eval_batch_size // jax.process_count()
	device_batch_size = batch_size // device_count
	logging.info('local_batch_size : %d', local_batch_size)
	logging.info('device_batch_size : %d', device_batch_size)

	shuffle_seed = config.get('shuffle_seed', None)
	if dataset_service_address and shuffle_seed is not None:
	raise ValueError(
	'Using dataset service with a random seed causes each '
	'worker to produce exactly the same data. Add '
	'config.shuffle_seed = None to your config if you want '
	'to run with dataset service.'
	)

	dataset_configs = dataset_configs or config.get('dataset_configs', {})
	num_local_shards = num_local_shards or jax.local_device_count()
	dataset = dataset_builder(
	batch_size=local_batch_size,
	eval_batch_size=eval_local_batch_size,
	num_shards=num_local_shards,
	dtype_str=config.data_dtype_str,
	rng=data_rng,
	shuffle_seed=shuffle_seed,
	dataset_configs=dataset_configs,
	dataset_service_address=dataset_service_address,
	**kwargs,
	)

	return dataset


	def initialize_multitask_model(
	*,
	model_def: nn.Module,
	input_spec: Dict[
	Tuple[Tuple[str, Any], ...],
	Sequence[Union[Tuple[Tuple[int, ...], jnp.dtype], Tuple[int, ...]]],
	],
	config: ml_collections.ConfigDict,
	rngs: Union[jnp.ndarray, Mapping[str, jnp.ndarray]],
	) -> Tuple[PyTree, PyTree, int, Optional[Dict[str, float]]]:
	"""Initializes parameters and model state.

	Args:
	model_def: Definition of a model.
	input_spec: A dictionary from a dict of keyword arguments to an iterable of
	(shape, dtype) pairs specifying the shape and dtype of the inputs. If
	unspecified the dtype is float32.
	config: Configurations of the initialization.
	rngs: Jax rng keys.

	Returns:
	Initial params, Init model_state, and number of trainable_params.
	"""

	def init_fn(model_def):
	for kwargs, in_spec in input_spec.items():
	if config.get('batch_sizes') is not None:
	batch_size = config.batch_sizes.get(dict(kwargs)['dataset'])
	else:
	batch_size = config.batch_size

	batch_size = (batch_size // jax.device_count()) if batch_size else None

	input_shapetype = [
	debug_utils.input_spec_to_jax_shape_dtype_struct(
	spec, batch_size=batch_size
	)
	for spec in in_spec
	]
	dummy_input = []
	for in_st in input_shapetype:
	dummy_input.append(jnp.zeros(in_st.shape, in_st.dtype))
	model_def(dummy_input, train=False, debug=False, *dict(kwargs))

	# We want all parameters to be created in host RAM, not on any device, they'll
	# be sent there later as needed, otherwise we already encountered two
	# situations where we allocate them twice.
	@functools.partial(jax.jit, backend='cpu')
	def _initialize_model(rngs):
	"""Initialization function to be jitted."""
	init_model_state, init_params = flax.core.pop(
	flax.core.freeze(nn.init(fn=init_fn, module=model_def)(rngs)), 'params'
	)
	# Set bias in the head to low value, such that loss is small initially.
	if (
	config.get('init_head_bias', None) is not None
	and 'output_projection' in init_params
	):
	init_params = flax.core.unfreeze(init_params)
	init_params['output_projection'] = optimizers.tree_map_with_names(
	lambda p: jnp.full_like(p, config.init_head_bias),
	init_params['output_projection'],
	match_name_fn=lambda name: 'bias' in name,
	)
	init_params = flax.core.freeze(init_params)
	return init_params, init_model_state

	if not isinstance(rngs, dict):
	rngs = {'params': rngs}
	init_params, init_model_state = _initialize_model(rngs)
	# Pop out params rng:
	rngs.pop('params')

	# Count number of trainable parameters:
	num_trainable_params = debug_utils.log_param_shapes(init_params)

	# Count gflops:
	count_flops = config.get('count_flops', ml_collections.ConfigDict())
	if count_flops:
	variables = {'params': init_params, **init_model_state}
	gflops_dict = {}
	gflops_all = 0
	for kwargs, in_spec in input_spec.items():
	flops = debug_utils.compute_flops(
	flax_model_apply_fn=functools.partial(
	model_def.apply,
	variables,
	train=False,
	debug=False,
	rngs=rngs,
	**dict(kwargs),
	),
	input_spec=count_flops.get('input_spec', in_spec),
	fuse_multiply_add=count_flops.get('fuse_multiply_add', True),
	)
	gflops = flops / (10**9)
	gflops_key = 'gflops/' + '/'.join(f'{x}={y}' for x, y in kwargs)
	gflops_dict[gflops_key] = gflops
	gflops_all += gflops
	gflops_dict['gflops'] = gflops_all
	else:
	gflops_dict = None

	return init_params, init_model_state, num_trainable_params, gflops_dict


	def get_num_training_steps(
	config: ml_collections.ConfigDict, dataset_metadata: Dict[str, Any]
	) -> Tuple[int, Optional[int]]:
	"""Calculates the total number of training step and possibly steps_per_epoch.

	The main raining loop is based on number of training steps. Thus, for datasets
	that we want to train based on number of epochs, we need to calculate the
	total number of training steps. This function looks for `num_training_steps`
	in config, if it exists it returns that as the total step and `None` as
	`steps_per_epoch`. If num_training_steps doesn't exist, then it looks for
	`num_training_epochs` and given the size of training data calculates the total
	steps and steps_per_epoch. In this computation, we assume that
	drop_remainder=True.

	Args:
	config: Configuration of the experiment.
	dataset_metadata: Meta-data that is generated by the dataset_builder.

	Returns:
	total_steps: Total number of training steps.
	steps_per_epoch: Number of steps in every epoch.
	"""
	# We either use num_training_epochs or num_training_steps.
	steps_per_epoch = (
	dataset_metadata.get('num_train_examples', 0) // config.batch_size
	)

	if config.get('num_training_steps') is not None:
	assert not config.get('num_training_epochs')
	return config.num_training_steps, steps_per_epoch or None
	else:
	assert config.num_training_epochs and not config.get('num_training_steps')
	assert steps_per_epoch > 0, 'num_train_examples should be defined.'
	return int(steps_per_epoch * config.num_training_epochs), steps_per_epoch


	@functools.partial(jax.pmap, axis_name='x')
	def pmap_mean(x: PyTree) -> PyTree:
	# An axis_name is passed to pmap which can then be used by pmean.
	# In this case each device has its own version of the batch statistics and
	# we average them.
	return jax.lax.pmean(x, 'x')


	def sync_model_state_across_replicas(train_state: TrainState) -> TrainState:
	"""Sync the model_state (like batch statistics) across replicas.

	Args:
	train_state: TrainState; Current state of training.

	Returns:
	Updated state of training in which model_state is synced across replicas.
	"""
	# TODO(dehghani): We simply do "mean" here and this doesn't work with
	# statistics like variance. (check the discussion in Flax for fixing this).
	if jax.tree_util.tree_leaves(train_state.model_state):
	# If the model_state is not empty.
	new_model_state = flax.core.copy(
	train_state.model_state,
	{'batch_stats': pmap_mean(train_state.model_state['batch_stats'])},
	)
	return train_state.replace( # pytype: disable=attribute-error
	model_state=new_model_state
	)
	else:
	return train_state


	def save_checkpoint(
	workdir: str,
	train_state: TrainState,
	max_to_keep: int = 3,
	overwrite: bool = False,
	**kwargs,
	):
	"""Saves a checkpoint.

	Args:
	workdir: Experiment directory for saving the checkpoint.
	train_state: An instance of TrainState that holds the state of training.
	max_to_keep: The number of checkpoints to keep.
	overwrite: Overwrite existing checkpoint if a checkpoint at the current or
	a later step already exits (default: False).
	**kwargs: Passed on to flax.training.checkpoints.save_checkpoint.
	"""
	if jax.process_index() == 0:
	# Get train state from the first replica.
	checkpoint_state = jax.device_get(train_state)
	checkpoints.save_checkpoint(
	workdir,
	checkpoint_state,
	int(checkpoint_state.global_step),
	overwrite=overwrite,
	keep=max_to_keep,
	**kwargs,
	)


	SIGNED_FLOAT_RE = re.compile(r'([-+]?(?:\d+(?:\.\d*)?\|\.\d+)(?:[eE][-+]?\d+)?)')


	def checkpoint_path_step(path: str) -> Optional[float]:
	"""Returns the step number of a checkpoint path.

	Copied from flax/training/checkpoints.PyTree

	Args:
	path: The path to the checkpoint.

	Returns:
	The global step corresponding to that checkpoint, or None if it can't be
	determined.
	"""
	for s in SIGNED_FLOAT_RE.split(path)[::-1]:
	if SIGNED_FLOAT_RE.match(s):
	return float(s)
	return None


	def restore_checkpoint(
	checkpoint_path: str,
	train_state: Optional[TrainState] = None,
	assert_exist: bool = False,
	step: Optional[int] = None,
	) -> Tuple[TrainState, int]:
	"""Restores the last checkpoint.

	First restores the checkpoint, which is an instance of TrainState that holds
	the state of training.

	Args:
	checkpoint_path: Directory or filename to restore the checkpoint from.
	train_state: An instance of TrainState that holds the state of training.
	assert_exist: Assert that there is at least one checkpoint in the given
	path.
	step: Step number to load or None to load latest. If specified,
	checkpoint_path must be a directory.

	Returns:
	training state and an int which is the current step.
	"""
	if assert_exist:
	if 'checkpoint_' in checkpoint_path.split('/')[-1]:
	glob_path = checkpoint_path
	else:
	glob_path = os.path.join(checkpoint_path, 'checkpoint_*')
	if not gfile.glob(glob_path):
	raise ValueError(
	'No checkpoint for the pretrained model is found in: '
	f'{checkpoint_path}'
	)
	if train_state is None:
	raise ValueError(
	'Please use `restore_pretrained_checkpoint` for loading'
	'a checkpoint without providing a Scenic TrainState.'
	)
	train_state = checkpoints.restore_checkpoint(
	checkpoint_path, train_state, step
	)
	return train_state, int(train_state.global_step)


	def bind_rng_to_host_device(
	rng: jnp.ndarray,
	axis_name: Union[str, Tuple[str, ...]],
	bind_to: Optional[str] = None,
	) -> jnp.ndarray:
	"""Binds a rng to the host/device we are on.

	Must be called from within a pmapped function. Note that when binding to
	"device", we also bind the rng to hosts, as we fold_in the rng with axis_index
	which is unique for devices across all hosts.

	Args:
	rng: A jax.random.PRNGKey.
	axis_name: The axis of the devices we are binding rng across.
	bind_to: Must be one of the 'host' or 'device'. None means no binding.

	Returns:
	jax.random.PRNGKey specialized to host/device.
	"""
	if bind_to is None:
	return rng
	if bind_to == 'host':
	return jax.random.fold_in(rng, jax.process_index())
	elif bind_to == 'device':
	return jax.random.fold_in(rng, jax.lax.axis_index(axis_name))
	else:
	raise ValueError(
	"`bind_to` should be one of the `[None, 'host', 'device']`"
	)


	class TrainingDivergedError(Exception):
	pass


	def normalize_metrics_summary(
	metrics_summary: Dict[str, Tuple[float, int]], split: str
	) -> Dict[str, float]:
	"""Normalize the metrics in summary by its normalizer.

	Args:
	metrics_summary: A dictionary mapping metric name to (value, normalizer).
	split: Split for which we normalize the metrics. Used for logging.

	Returns:
	Normalized metrics summary.

	Raises:
	TrainingDivergedError: Due to observing a NaN in the metrics.
	"""
	# TODO(dehghani): Currently we only support metrics of the form 1/N sum
	# f(x_i). We may need a more general framework for metrics like
	# precision and recall. Note in particular that while we're normalizing by
	# the "metric normalization value" that is val[1], this value is previously
	# summed up and is defined to be an integer.
	normalized_metrics_summary = {}
	for key, val in metrics_summary.items():
	normalized_metrics_summary[key] = val[0] / (val[1] + 1e-9)
	if np.isnan(normalized_metrics_summary[key]):
	msg = f'NaN detected in {split}_{key} (Unnormalized values: {val})'
	if split == 'train':
	raise TrainingDivergedError(msg)
	else:
	logging.error('WARNING: Split %s %s', split, msg)

	return normalized_metrics_summary


	def stack_forest(forest: PyTree) -> PyTree:
	"""Transposes a list of dicts to dict of lists.

	For example,
	given
	[{'a':1,'b':2}, {'a':3,'b':4}],
	the output is:
	{'a': ([1, 3]), 'b': ([2, 4])}

	Args:
	forest: a list of dicts

	Returns:
	a dict of lists.
	"""
	if not forest:
	return {}

	stack_args = lambda *args: np.stack(args)
	return jax.tree_util.tree_map(stack_args, *forest)


	def unreplicate_and_get(x: PyTree) -> PyTree:
	return jax.device_get(jax_utils.unreplicate(x))


	def process_and_fetch_to_host(
	pred_or_tgt: Union[jnp.ndarray, Dict[str, jnp.ndarray]],
	batch_mask: jnp.ndarray,
	) -> Union[Sequence[jnp.ndarray], Dict[str, jnp.ndarray]]:
	"""Used to collect predictions and targets of the whole valid/test set.

	Args:
	pred_or_tgt: A jnp-array or dict of arrays, each of shape `[n_dev, bs,
	X,...,Y].
	batch_mask: A nd-array of shape `[nun_devices, bs]`, where zero values
	indicate padded examples.

	Returns:
	A list of length n_dev*bs of items, where each item is a dictionary with
	same keys as `pred_or_tgt` & values are normal np-arrays of shape [X,...,Y].
	"""

	def _split_mini_batches(x):
	# Fetch to host and filter out padded examples.
	x = jax.device_get(x)[np.array(batch_mask).astype(bool)]
	# Split minibatch of examples into a list of examples.
	x_list = jnp.split(x, x.shape[0], axis=0)
	# Squeeze out the dummy dimension.
	return jax.tree_util.tree_map(lambda x: jnp.squeeze(x, axis=0), x_list)

	pred_or_tgt = jax.tree_util.tree_map(_split_mini_batches, pred_or_tgt)

	if isinstance(pred_or_tgt, list):
	# Pred_or_tgt was a single array, so just return the list:
	return pred_or_tgt
	else:
	# Pred_or_tgt was dict of arrays, so convert dict of lists to list of dicts:
	keys, values = zip(*pred_or_tgt.items())
	return [dict(zip(keys, v)) for v in zip(*values)] # pytype: disable=bad-return-type # jax-ndarray


	@functools.partial(jax.pmap, axis_name='i')
	def _barrier(x):
	return jax.lax.psum(x, axis_name='i')


	def barrier():
	"""MPI-like barrier."""
	jax.device_get(_barrier(jnp.ones((jax.local_device_count(),))))


	def log_eval_summary(
	step: int,
	*,
	writer: metric_writers.MetricWriter,
	eval_metrics: Sequence[Dict[str, Tuple[float, int]]],
	extra_eval_summary: Optional[Mapping[str, float]] = None,
	metrics_normalizer_fn: Optional[
	Callable[[Dict[str, Tuple[float, int]], str], Dict[str, float]]
	] = None,
	prefix: str = 'valid',
	key_separator: str = '_',
	flush_writer: bool = True,
	) -> Dict[str, float]:
	"""Computes and logs eval metrics.

	Args:
	step: Current step.
	writer: Metric writer object.
	eval_metrics: List of dictionaries of calculated metrics. Usually the
	sequence is the concatenation of the per-eval-step metrics, and every
	dictionary maps a metric name to an array of (value, normalizer) - where
	the array index is usually the batch index.
	extra_eval_summary: A dict containing summaries that are already ready to be
	logged, e.g. global metrics from eval set, like precision/recall.
	metrics_normalizer_fn: Used for normalizing metrics. The API for this
	function is: `new_metrics_dict = metrics_normalizer_fn(metrics_dict,
	split)`. If set to None, we use the `normalize_metrics_summary` which uses
	the normalizer paired with each metric to normalize it (after summing both
	metric and normalizer values).
	prefix: str; Prefix added to the name of the summaries writen by this
	function.
	key_separator: Separator added between the prefix and key.
	flush_writer: If True, flush the writer after logging.

	Returns:
	A dictionary of metrics, mapping both `eval_metrics` and
	`extra_eval_summary` from metric name (incl. `prefix`) to float value.
	"""
	eval_metrics = stack_forest(eval_metrics)

	# Compute the sum over all examples in all batches.
	eval_metrics_summary = jax.tree_util.tree_map(lambda x: x.sum(), eval_metrics)
	# Normalize metrics by the total number of examples.
	metrics_normalizer_fn = metrics_normalizer_fn or normalize_metrics_summary
	eval_metrics_summary = metrics_normalizer_fn(eval_metrics_summary, 'eval')
	# If None, set to an empty dictionary.
	extra_eval_summary = extra_eval_summary or {}

	# Adds extra_eval_summary to the returned eval_summary.
	eval_metrics_summary.update(extra_eval_summary)

	writer.write_scalars(
	step,
	{
	key_separator.join((prefix, key)): val
	for key, val in eval_metrics_summary.items()
	},
	)

	if flush_writer:
	writer.flush()
	return eval_metrics_summary


	def log_train_summary(
	step: int,
	*,
	writer: metric_writers.MetricWriter,
	train_metrics: Sequence[Dict[str, Tuple[float, int]]],
	extra_training_logs: Optional[Sequence[Dict[str, Any]]] = None,
	metrics_normalizer_fn: Optional[
	Callable[[Dict[str, Tuple[float, int]], str], Dict[str, float]]
	] = None,
	prefix: str = 'train',
	key_separator: str = '_',
	flush_writer: bool = True,
	) -> Dict[str, float]:
	"""Computes and logs train metrics.

	Args:
	step: Current step.
	writer: Summary writer.
	train_metrics: List of dictionaries of calculated metrics. Usually the
	sequence is the concatenation of the per-eval-step metrics, and every
	dictionary maps a metric name to an array of (value, normalizer) - where
	the array index is usually the batch index.
	extra_training_logs: List of dictionaries, containing additional training
	logs, from every train step, e.g. learning rate, Time, num parameters,
	etc. Their mean will be logged.
	metrics_normalizer_fn: Used for normalizing metrics. The API for this
	function is: `new_metrics_dict = metrics_normalizer_fn(metrics_dict,
	split)`. If set to None, we use the normalize_metrics_summary which uses
	the normalizer paired with each metric to normalize it.
	prefix: str; Prefix added to the name of the summaries writen by this
	function.
	key_separator: Separator added between the prefix and key.
	flush_writer: If True, flush the writer after logging.

	Returns:
	A dictionary of metrics, mapping `train_metrics from metric name (incl.
	`prefix`) to float value.
	"""
	##### Prepare metrics:
	# Get metrics from devices:
	train_metrics = stack_forest(train_metrics)
	# Compute the sum over all examples in all batches:
	train_metrics_summary = jax.tree_util.tree_map(
	lambda x: x.sum(), train_metrics
	)
	# Normalize metrics by the total number of examples:
	metrics_normalizer_fn = metrics_normalizer_fn or normalize_metrics_summary
	train_metrics_summary = metrics_normalizer_fn(train_metrics_summary, 'train')

	##### Prepare additional training logs:
	# If None, set to an empty dictionary.
	extra_training_logs = extra_training_logs or [{}]
	train_logs = stack_forest(extra_training_logs)

	# Metrics:
	writer.write_scalars(
	step,
	{
	key_separator.join((prefix, key)): val
	for key, val in train_metrics_summary.items()
	},
	)
	# Additional logs:
	writer.write_scalars(
	step, {key: val.mean() for key, val in train_logs.items()}
	)

	if flush_writer:
	writer.flush()
	return train_metrics_summary


	def accumulate_gradients(
	compute_gradient_fn: Callable[
	[TrainState, Dict[str, jnp.ndarray], jnp.ndarray],
	Tuple[Any, jnp.ndarray],
	],
	metrics_fn: Callable[
	[jnp.ndarray, Dict[str, jnp.ndarray]], Dict[str, Tuple[float, int]]
	],
	train_state: TrainState,
	batch: Dict[str, jnp.ndarray],
	dropout_rng: jnp.ndarray,
	accum_steps: Optional[int],
	) -> Tuple[
	Optional[jnp.ndarray],
	jnp.ndarray,
	jnp.ndarray,
	Dict[str, Tuple[float, int]],
	]:
	"""Accumulate gradients over multiple steps.

	This enables training with larger effective batch sizes.
	Note that currently, gradient accumulation is not supported when the
	`model_state` is used, e.g., for models that have batch normalization and
	store batch statistics in the `model_state`.

	Note that if `accum_steps` <= 1 or is None, then the gradient of a single step
	is simply returned.

	Args:
	compute_gradient_fn: Gradient function, e.g., `jax.value_and_grad(
	training_loss_fn, ...).
	metrics_fn: A metrics function that given logits and batch of data,
	calculates the metrics.
	train_state: An instance of TrainState that has the parameters of the model,
	state of the model, etc.
	batch: A single batch of data. The buffer of this argument can be donated to
	the computation.
	dropout_rng: JAX rng key used for dropout.
	accum_steps: Number of accumulating steps (number of micro batches). When
	set to None or =<1, no accumulation is done.

	Returns:
	A tuple of model_state (e.g., batch statistics),
	computed gradients, training loss, and calculated metrics.
	"""
	params = train_state.params
	if accum_steps and accum_steps > 1:
	batch_size = next(iter(batch.values())).shape[0]
	microbatch_size = batch_size // accum_steps
	if batch_size % accum_steps != 0:
	raise ValueError(
	f'Bad accum_steps {accum_steps} for batch size {batch_size}'
	)
	logging.info(
	'Using microbatches: %d microbatches, %d size',
	accum_steps,
	microbatch_size,
	)

	def get_microbatch(
	batch: Dict[str, jnp.ndarray], idx: int
	) -> Dict[str, jnp.ndarray]:
	"""Fetch microbatch slice from the given batch."""
	return jax.tree_util.tree_map(
	lambda x: x.reshape((-1, microbatch_size) + x.shape[1:])[idx], batch
	)

	def per_microbatch_compute_gradient_fn(
	loop_cnt: int,
	loop_state: Tuple[
	jnp.ndarray, jnp.ndarray, jnp.ndarray, Dict[str, Tuple[float, int]]
	],
	) -> Tuple[
	jnp.ndarray, jnp.ndarray, Dict[str, Tuple[float, int]], jnp.ndarray
	]:
	dropout_rng, grad_accum, train_loss_acc, metrics_acc = loop_state
	dropout_rng, sub_dropout_rng = jax.random.split(dropout_rng)
	mbatch = get_microbatch(batch, loop_cnt)
	(train_loss, (_, mlogits)), grad = compute_gradient_fn(
	params, mbatch, sub_dropout_rng
	)
	metrics = metrics_fn(mlogits, mbatch)
	# Accumulate gradients and metrics.
	grad = jax.tree_util.tree_map(jnp.add, grad_accum, grad)
	metrics = jax.tree_util.tree_map(jnp.add, metrics, metrics_acc)
	train_loss = jax.tree_util.tree_map(jnp.add, train_loss, train_loss_acc)
	return dropout_rng, grad, train_loss, metrics

	# Initialize gradient accumulation loop state.
	dropout_rng, sub_dropout_rng = jax.random.split(dropout_rng)
	init_mbatch = get_microbatch(batch, 0)
	(init_train_loss, (model_state, init_logits)), grad_init = (
	compute_gradient_fn(params, init_mbatch, sub_dropout_rng)
	)
	if jax.tree_util.tree_leaves(model_state):
	# If the model_state is not empty.
	raise ValueError(
	'Gradient accumulation is not supported when the '
	'model_state is in used (e.g. models w/ batch norm).'
	)

	metrics_init = metrics_fn(init_logits, init_mbatch)
	del init_logits, init_mbatch

	# Run gradient accumulation loop.
	loop_init = (dropout_rng, grad_init, init_train_loss, metrics_init)
	_, grad_acc, train_loss, metrics_acc = jax.lax.fori_loop(
	1, accum_steps, per_microbatch_compute_gradient_fn, loop_init
	)
	grad_acc = jax.tree_util.tree_map(lambda x: x / accum_steps, grad_acc)
	train_loss = jax.tree_util.tree_map(lambda x: x / accum_steps, train_loss)
	return model_state, grad_acc, train_loss, metrics_acc
	else:
	(train_loss, (model_state, logits)), grad = compute_gradient_fn(
	params, batch, dropout_rng
	)
	metrics = metrics_fn(logits, batch)
	return model_state, grad, train_loss, metrics


	class Chrono:
	"""Measures time and reports progress.

	This is a modified fork of Chrono class from big_vision codebase:
	https://github.com/google-research/big_vision/blob/main/big_vision/utils.py

	Some concepts:
	1. This differentiates between three "types" of time:
	- training time: the time spent on actual training (fprop/bprop/update)
	- program time: overall time the program runs, including all overheads
	- pause time: the chronometer can be paused (eg during evals).
	2. This handles a "warmup": the first step is skipped for training time
	purposes, as it includes significant compilation overheads, which distort
	estimates.
	3. `accumulates` (i.e. integrates) timings, and saves/loads them across
	restarts.
	"""

	def __init__(self, example_type: str = 'img', warmup: int = 2):
	self.program_start_time = time.monotonic()
	self.train_start_time = None
	self.train_start_step = None # When we started timing (after warmup)

	self.prev_time = None
	self.prev_step = None

	self.pause_start = None
	self.paused_time = 0

	self.warmup = warmup # How many calls to `tick` to skip.
	self.load() # Inits accum integrators.
	self.note = 'Chrono n/a'
	self.example_type = example_type

	def inform(
	self,
	first_step: int,
	total_steps: int,
	global_bs: int,
	steps_per_epoch: int,
	):
	"""Provide some extra info that's only known later in the program."""
	self.prev_step = copy.deepcopy(first_step)
	self.first_step = copy.deepcopy(first_step)
	self.total_steps = total_steps
	self.steps_per_epoch = steps_per_epoch
	self.global_bs = global_bs
	if total_steps:
	self.note = (
	f'Steps:{first_step}/{total_steps} [{first_step/total_steps:.1%}]'
	)

	def tick(
	self,
	step: int,
	writer: metric_writers.MetricWriter,
	write_note: Callable[[str], None],
	):
	"""A chronometer tick."""
	summary = {}

	def hms(s):
	"""Format time in hours/minutes/seconds."""
	if s < 60:
	return f'{s:.0f}s'
	m, s = divmod(s, 60)
	if m < 60:
	return f'{m:.0f}m{s:.0f}s'
	h, m = divmod(m, 60)
	return f'{h:.0f}h{m:.0f}m' # Seconds intentionally omitted.

	now = time.monotonic()
	summary.update({'uptime': now - self.program_start_time})
	# We always count examples, regardless of the timing-related warmup that
	# happens a few lines below.
	ds = step - self.prev_step # Steps between ticks
	self.prev_step = step
	self.accum_examples_seen += ds * self.global_bs
	summary.update({'examples_seen': self.accum_examples_seen})
	if self.steps_per_epoch:
	summary.update({'epoch': step / self.steps_per_epoch})

	# We take the start as the second time `tick` is called, so we avoid
	# measuring the overhead of compilation and don't include it in time
	# estimates.
	if self.warmup > 1:
	self.warmup -= 1
	write_note(self.note) # This can help debugging.
	return
	if self.warmup == 1:
	self.train_start_time = self.prev_time = now
	self.train_start_step = step
	self.accum_program_time += now - self.program_start_time
	self.paused_time = 0 # Drop pauses that happened before timing starts.
	self.warmup = 0
	write_note(self.note) # This can help debugging.
	return

	# Measurement with micro-timings of current training steps speed.
	# Time between ticks (ignoring pause)
	if self.prev_time is None:
	raise ValueError('prev_time is None, possible warmup was skipped')
	dt = now - self.prev_time - self.paused_time
	ncores = jax.device_count() # Global device count
	summary.update({
	f'{self.example_type}/sec/core': self.global_bs * ds / dt / ncores,
	f'{self.example_type}/sec': self.global_bs * ds / dt,
	})

	# Accumulate (integrate) times, good for plots.
	self.accum_train_time += dt
	self.accum_pause_time += self.paused_time
	self.accum_program_time += dt + self.paused_time

	# Convert to, and log as, core hours.
	core_hours = self.accum_train_time * ncores / 60 / 60
	devtype = jax.devices()[0].device_kind
	summary.update({
	f'core_hours_{devtype}': core_hours,
	'core_hours': core_hours, # For convenience as x-axis in sweeps.
	})

	# Progress note with "global" full-program average timings
	# (eg in program-time minus warmup)
	dt = now - self.train_start_time # Time elapsed since end of warmup.
	steps_timed = step - self.train_start_step
	steps_todo = self.total_steps - step
	self.note = f'Steps:{step}/{self.total_steps} [{step/self.total_steps:.1%}]'
	self.note += f'\nWalltime:{hms(self.accum_program_time)}'
	self.note += f' ({hms(self.accum_pause_time)} Not-train)'
	self.note += f'\nETA:{hms(dt / steps_timed * steps_todo)}'
	self.note += (
	f'\nTotal train time:{hms(dt / steps_timed * self.total_steps)}'
	)
	write_note(self.note)
	writer.write_scalars(step, summary)
	self.prev_time = now
	self.paused_time = 0

	def pause(self, wait_for=()):
	assert self.pause_start is None, "Don't pause twice."
	jax.block_until_ready(wait_for)
	self.pause_start = time.monotonic()

	def resume(self):
	assert self.pause_start is not None, 'Cannot resume without pausing first.'
	self.paused_time += time.monotonic() - self.pause_start
	self.pause_start = None

	def save(self):
	return dict(
	accum_program_time=self.accum_program_time,
	accum_train_time=self.accum_train_time,
	accum_pause_time=self.accum_pause_time,
	accum_examples_seen=self.accum_examples_seen,
	)

	def load(self, ckpt={}): # pylint: disable=dangerous-default-value
	self.accum_program_time = ckpt.get('accum_program_time', 0.0)
	self.accum_train_time = ckpt.get('accum_train_time', 0.0)
	self.accum_pause_time = ckpt.get('accum_pause_time', 0.0)
	self.accum_examples_seen = ckpt.get('accum_examples_seen', 0)


	def barrier_across_hosts():
	"""Ensure all hosts stay up until the end, otherwise the program may hang."""
	if jax.process_count() > 1:
	x = jnp.ones([jax.local_device_count()])
	x = jax.device_get(jax.pmap(lambda x: jax.lax.psum(x, 'i'), 'i')(x))
	assert x[0] == jax.device_count()


	def handle_checkpointing(
	train_state: TrainState,
	chrono: Chrono,
	workdir: str,
	max_checkpoints_to_keep=3,
	):
	"""Handles all the bookkeeping around checkpointing.

	Syncs the training state and unreplicates it, stops & restarts Chrono
	(and handles its metadata) and writes the actual checkpoint.

	Args:
	train_state: A replicated TrainState.
	chrono: The Chrono object.
	workdir: the workdir of the process.
	max_checkpoints_to_keep: how many checkpoints to keep.
	"""
	train_state = sync_model_state_across_replicas(train_state)
	if jax.process_index() == 0:
	unrep_train_state = jax_utils.unreplicate(train_state)
	metadata = unrep_train_state.metadata
	metadata['chrono'] = chrono.save()
	unrep_train_state = unrep_train_state.replace(metadata=metadata)
	save_checkpoint(
	workdir, unrep_train_state, max_to_keep=max_checkpoints_to_keep
	)
	del unrep_train_state