|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| r"""Creates and runs TF2 object detection models.
|
|
|
| For local training/evaluation run:
|
| PIPELINE_CONFIG_PATH=path/to/pipeline.config
|
| MODEL_DIR=/tmp/model_outputs
|
| NUM_TRAIN_STEPS=10000
|
| SAMPLE_1_OF_N_EVAL_EXAMPLES=1
|
| python model_main_tf2.py -- \
|
| --model_dir=$MODEL_DIR --num_train_steps=$NUM_TRAIN_STEPS \
|
| --sample_1_of_n_eval_examples=$SAMPLE_1_OF_N_EVAL_EXAMPLES \
|
| --pipeline_config_path=$PIPELINE_CONFIG_PATH \
|
| --alsologtostderr
|
| """
|
| from absl import flags
|
| import tensorflow.compat.v2 as tf
|
| from object_detection import model_lib_v2
|
|
|
| flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config '
|
| 'file.')
|
| flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
|
| flags.DEFINE_bool('eval_on_train_data', False, 'Enable evaluating on train '
|
| 'data (only supported in distributed training).')
|
| flags.DEFINE_integer('sample_1_of_n_eval_examples', None, 'Will sample one of '
|
| 'every n eval input examples, where n is provided.')
|
| flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
|
| 'one of every n train input examples for evaluation, '
|
| 'where n is provided. This is only used if '
|
| '`eval_training_data` is True.')
|
| flags.DEFINE_string(
|
| 'model_dir', None, 'Path to output model directory '
|
| 'where event and checkpoint files will be written.')
|
| flags.DEFINE_string(
|
| 'checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
|
| '`checkpoint_dir` is provided, this binary operates in eval-only mode, '
|
| 'writing resulting metrics to `model_dir`.')
|
|
|
| flags.DEFINE_integer('eval_timeout', 3600, 'Number of seconds to wait for an'
|
| 'evaluation checkpoint before exiting.')
|
|
|
| flags.DEFINE_bool('use_tpu', False, 'Whether the job is executing on a TPU.')
|
| flags.DEFINE_string(
|
| 'tpu_name',
|
| default=None,
|
| help='Name of the Cloud TPU for Cluster Resolvers.')
|
| flags.DEFINE_integer(
|
| 'num_workers', 1, 'When num_workers > 1, training uses '
|
| 'MultiWorkerMirroredStrategy. When num_workers = 1 it uses '
|
| 'MirroredStrategy.')
|
| flags.DEFINE_integer(
|
| 'checkpoint_every_n', 1000, 'Integer defining how often we checkpoint.')
|
| flags.DEFINE_boolean('record_summaries', True,
|
| ('Whether or not to record summaries defined by the model'
|
| ' or the training pipeline. This does not impact the'
|
| ' summaries of the loss values which are always'
|
| ' recorded.'))
|
|
|
| FLAGS = flags.FLAGS
|
|
|
|
|
| def main(unused_argv):
|
| flags.mark_flag_as_required('model_dir')
|
| flags.mark_flag_as_required('pipeline_config_path')
|
| tf.config.set_soft_device_placement(True)
|
|
|
| if FLAGS.checkpoint_dir:
|
| model_lib_v2.eval_continuously(
|
| pipeline_config_path=FLAGS.pipeline_config_path,
|
| model_dir=FLAGS.model_dir,
|
| train_steps=FLAGS.num_train_steps,
|
| sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
|
| sample_1_of_n_eval_on_train_examples=(
|
| FLAGS.sample_1_of_n_eval_on_train_examples),
|
| checkpoint_dir=FLAGS.checkpoint_dir,
|
| wait_interval=300, timeout=FLAGS.eval_timeout)
|
| else:
|
| if FLAGS.use_tpu:
|
|
|
|
|
| resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
|
| FLAGS.tpu_name)
|
| tf.config.experimental_connect_to_cluster(resolver)
|
| tf.tpu.experimental.initialize_tpu_system(resolver)
|
| strategy = tf.distribute.experimental.TPUStrategy(resolver)
|
| elif FLAGS.num_workers > 1:
|
| strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
|
| else:
|
| strategy = tf.compat.v2.distribute.MirroredStrategy()
|
|
|
| with strategy.scope():
|
| model_lib_v2.train_loop(
|
| pipeline_config_path=FLAGS.pipeline_config_path,
|
| model_dir=FLAGS.model_dir,
|
| train_steps=FLAGS.num_train_steps,
|
| use_tpu=FLAGS.use_tpu,
|
| checkpoint_every_n=FLAGS.checkpoint_every_n,
|
| record_summaries=FLAGS.record_summaries)
|
|
|
| if __name__ == '__main__':
|
| tf.compat.v1.app.run()
|
|
|