| """Register flags for optimizing performance."""
|
|
|
| import multiprocessing
|
|
|
| from absl import flags
|
| import tensorflow as tf, tf_keras
|
|
|
| from official.utils.flags._conventions import help_wrap


# Maps the --dtype flag string values to TensorFlow dtypes.
DTYPE_MAP = {
    "fp16": tf.float16,
    "bf16": tf.bfloat16,
    "fp32": tf.float32,
}


def get_tf_dtype(flags_obj):
  """Returns the tf.dtype corresponding to the --dtype flag."""
  if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite":
    # With the graph rewrite, the graph is built in float32 and the rewrite
    # converts selected ops to float16, so float32 is returned here.
    return tf.float32
  return DTYPE_MAP[flags_obj.dtype]


def get_loss_scale(flags_obj, default_for_fp16):
  """Returns the loss scale implied by the --loss_scale and --dtype flags."""
  dtype = get_tf_dtype(flags_obj)
  if flags_obj.loss_scale == "dynamic":
    return flags_obj.loss_scale
  elif flags_obj.loss_scale is not None:
    return float(flags_obj.loss_scale)
  elif dtype == tf.float32 or dtype == tf.bfloat16:
    # Loss scaling is only needed for float16; float32 and bfloat16 have
    # enough exponent range that a scale of 1 suffices.
    return 1
  else:
    assert dtype == tf.float16
    return default_for_fp16
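

# Illustrative usage sketch (comments only; it assumes the flags below were
# registered via define_performance(dtype=True, loss_scale=True,
# fp16_implementation=True) and that absl.app has parsed the command line):
#
#   dtype = get_tf_dtype(flags.FLAGS)      # e.g. tf.float16 for --dtype=fp16
#   loss_scale = get_loss_scale(flags.FLAGS, default_for_fp16=128)
#
# Here default_for_fp16=128 is just an example value chosen by the caller.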


def define_performance(num_parallel_calls=False,
                       inter_op=False,
                       intra_op=False,
                       synthetic_data=False,
                       max_train_steps=False,
                       dtype=False,
                       all_reduce_alg=False,
                       num_packs=False,
                       tf_gpu_thread_mode=False,
                       datasets_num_private_threads=False,
                       datasets_num_parallel_batches=False,
                       fp16_implementation=False,
                       loss_scale=False,
                       tf_data_experimental_slack=False,
                       enable_xla=False,
                       training_dataset_cache=False):
  """Register flags for specifying performance tuning arguments.

  Args:
    num_parallel_calls: Create a flag to specify parallelism of data loading.
    inter_op: Create a flag to allow specification of inter op threads.
    intra_op: Create a flag to allow specification of intra op threads.
    synthetic_data: Create a flag to allow the use of synthetic data.
    max_train_steps: Create a flag to allow specification of the maximum
      number of training steps.
    dtype: Create flags for specifying dtype.
    all_reduce_alg: If set, forces a specific algorithm for multi-GPU
      all-reduce.
    num_packs: If set, provides the number of packs for MirroredStrategy's
      cross device ops.
    tf_gpu_thread_mode: gpu_private triggers use of a private thread pool.
    datasets_num_private_threads: Number of private threads for datasets.
    datasets_num_parallel_batches: Determines how many batches to process in
      parallel when using map and batch from tf.data.
    fp16_implementation: Create fp16_implementation flag.
    loss_scale: Controls the loss scaling, normally for mixed-precision
      training. Can only be turned on if dtype is also True.
    tf_data_experimental_slack: Determines whether to enable tf.data's
      `experimental_slack` option.
    enable_xla: Determines if XLA (auto clustering) is turned on.
    training_dataset_cache: Whether to cache the training dataset on workers.
      Typically used to improve training performance when training data is in
      remote storage and can fit into worker memory.

  Returns:
    A list of flags for core.py to mark as key flags.
  """

  key_flags = []
  if num_parallel_calls:
    flags.DEFINE_integer(
        name="num_parallel_calls",
        short_name="npc",
        default=multiprocessing.cpu_count(),
        help=help_wrap("The number of records that are processed in parallel "
                       "during input processing. This can be optimized per "
                       "data set but for generally homogeneous data sets, "
                       "should be approximately the number of available CPU "
                       "cores. (default behavior)"))

  if inter_op:
    flags.DEFINE_integer(
        name="inter_op_parallelism_threads",
        short_name="inter",
        default=0,
        help=help_wrap("Number of inter_op_parallelism_threads to use for "
                       "CPU. See TensorFlow config.proto for details."))

  if intra_op:
    flags.DEFINE_integer(
        name="intra_op_parallelism_threads",
        short_name="intra",
        default=0,
        help=help_wrap("Number of intra_op_parallelism_threads to use for "
                       "CPU. See TensorFlow config.proto for details."))

  if synthetic_data:
    flags.DEFINE_bool(
        name="use_synthetic_data",
        short_name="synth",
        default=False,
        help=help_wrap(
            "If set, use fake data (zeroes) instead of a real dataset. "
            "This mode is useful for performance debugging, as it removes "
            "input processing steps, but will not learn anything."))

  if max_train_steps:
    flags.DEFINE_integer(
        name="max_train_steps",
        short_name="mts",
        default=None,
        help=help_wrap(
            "The model will stop training if the global_step reaches this "
            "value. If not set, training will run until the specified number "
            "of epochs have run as usual. It is generally recommended to set "
            "--train_epochs=1 when using this flag."))

  if dtype:
    flags.DEFINE_enum(
        name="dtype",
        short_name="dt",
        default="fp32",
        enum_values=DTYPE_MAP.keys(),
        help=help_wrap("The TensorFlow datatype used for calculations. "
                       "For 16-bit dtypes, variables and certain ops will "
                       "still be float32 for numeric stability."))

    if loss_scale:
      flags.DEFINE_string(
          name="loss_scale",
          short_name="ls",
          default=None,
          help=help_wrap(
              "The amount to scale the loss by when --dtype=fp16. This can "
              "be an int/float or the string 'dynamic'. Before gradients are "
              "computed, the loss is multiplied by the loss scale, making "
              "all gradients loss_scale times larger. To adjust for this, "
              "gradients are divided by the loss scale before being applied "
              "to variables. This is mathematically equivalent to training "
              "without a loss scale, but the loss scale helps avoid some "
              "intermediate gradients from underflowing to zero. The default "
              "is 'dynamic', which dynamically determines the optimal loss "
              "scale during training."))

      @flags.validator(
          flag_name="loss_scale",
          message="loss_scale should be a positive int/float or the string "
                  "'dynamic'.")
      def _check_loss_scale(loss_scale):
        """Validator to check the loss scale flag is valid."""
        if loss_scale is None:
          return True  # The None case is handled by get_loss_scale().

        if loss_scale == "dynamic":
          return True

        try:
          loss_scale = float(loss_scale)
        except ValueError:
          return False

        return loss_scale > 0
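
      # For reference, the validator above accepts values such as
      # --loss_scale=dynamic, --loss_scale=128, or --loss_scale=0.5, and
      # rejects non-numeric strings and non-positive values like
      # --loss_scale=0.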

    if fp16_implementation:
      flags.DEFINE_enum(
          name="fp16_implementation",
          default="keras",
          enum_values=("keras", "graph_rewrite"),
          help=help_wrap(
              "When --dtype=fp16, how fp16 should be implemented. This has "
              "no impact on correctness. 'keras' uses the "
              "tf_keras.mixed_precision API. 'graph_rewrite' uses the "
              "tf.compat.v1.mixed_precision."
              "enable_mixed_precision_graph_rewrite API."))

      @flags.multi_flags_validator(
          ["fp16_implementation", "dtype", "loss_scale"])
      def _check_fp16_implementation(flags_dict):
        """Validator to check fp16_implementation flag is valid."""
        if (flags_dict["fp16_implementation"] == "graph_rewrite" and
            flags_dict["dtype"] != "fp16"):
          raise flags.ValidationError("--fp16_implementation should not be "
                                      "specified unless --dtype=fp16")
        return True
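
      # Illustrative sketch (comments only; `optimizer` and the wiring below
      # are assumptions about the calling training script, not part of this
      # module) of how the two implementations are typically enabled:
      #
      #   if flags.FLAGS.fp16_implementation == "keras":
      #     tf_keras.mixed_precision.set_global_policy("mixed_float16")
      #   else:  # "graph_rewrite"
      #     optimizer = (
      #         tf.compat.v1.mixed_precision
      #         .enable_mixed_precision_graph_rewrite(optimizer))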

  if all_reduce_alg:
    flags.DEFINE_string(
        name="all_reduce_alg",
        short_name="ara",
        default=None,
        help=help_wrap("Defines the algorithm to use for performing "
                       "all-reduce. When specified with MirroredStrategy for "
                       "single worker, this controls "
                       "tf.contrib.distribute.AllReduceCrossTowerOps. When "
                       "specified with MultiWorkerMirroredStrategy, this "
                       "controls "
                       "tf.distribute.experimental.CollectiveCommunication; "
                       "valid options are `ring` and `nccl`."))

  if num_packs:
    flags.DEFINE_integer(
        name="num_packs",
        default=1,
        help=help_wrap("Sets `num_packs` in the cross device ops used in "
                       "MirroredStrategy. For details, see "
                       "tf.distribute.NcclAllReduce."))

  if tf_gpu_thread_mode:
    flags.DEFINE_string(
        name="tf_gpu_thread_mode",
        short_name="gt_mode",
        default=None,
        help=help_wrap(
            "Whether and how the GPU device uses its own threadpool."))

    flags.DEFINE_integer(
        name="per_gpu_thread_count",
        short_name="pgtc",
        default=0,
        help=help_wrap("The number of threads to use for GPU. Only valid "
                       "when tf_gpu_thread_mode is not global."))

  if datasets_num_private_threads:
    flags.DEFINE_integer(
        name="datasets_num_private_threads",
        default=None,
        help=help_wrap(
            "Number of threads for a private threadpool created for all "
            "datasets computation."))

  if datasets_num_parallel_batches:
    flags.DEFINE_integer(
        name="datasets_num_parallel_batches",
        default=None,
        help=help_wrap(
            "Determines how many batches to process in parallel when using "
            "map and batch from tf.data."))

  if training_dataset_cache:
    flags.DEFINE_boolean(
        name="training_dataset_cache",
        default=False,
        help=help_wrap(
            "Determines whether to cache the training dataset on workers. "
            "Typically used to improve training performance when training "
            "data is in remote storage and can fit into worker memory."))

  if tf_data_experimental_slack:
    flags.DEFINE_boolean(
        name="tf_data_experimental_slack",
        default=False,
        help=help_wrap(
            "Whether to enable tf.data's `experimental_slack` option."))

  if enable_xla:
    flags.DEFINE_boolean(
        name="enable_xla",
        default=False,
        help="Whether to enable XLA auto jit compilation")

  return key_flags
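

# Minimal end-to-end sketch (comments only; importing _performance directly
# and the flag choices below are assumptions about a calling script, not part
# of this module):
#
#   from absl import app, flags
#   from official.utils.flags import _performance
#
#   _performance.define_performance(dtype=True, loss_scale=True)
#
#   def main(_):
#     dtype = _performance.get_tf_dtype(flags.FLAGS)
#     loss_scale = _performance.get_loss_scale(flags.FLAGS,
#                                              default_for_fp16=128)
#     ...
#
#   if __name__ == "__main__":
#     app.run(main)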