|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """Creating the task and start trainer."""
|
|
|
| import pprint
|
|
|
| from absl import app
|
| from absl import flags
|
| from absl import logging
|
| import gin
|
| from official.common import distribute_utils
|
| from official.common import flags as tfm_flags
|
| from official.core import config_definitions as cfg
|
| from official.core import train_utils
|
| from official.modeling import hyperparams
|
| from official.modeling import optimization
|
| from official.modeling import performance
|
| from official.modeling.fast_training.progressive import train_lib
|
| from official.modeling.fast_training.progressive import trainer as prog_trainer_lib
|
| from official.nlp.data import pretrain_dataloader
|
| from official.projects.mobilebert import distillation
|
|
|
|
|
| FLAGS = flags.FLAGS
|
|
|
# Default optimizer setup for MobileBERT distillation pretraining:
# LAMB with polynomial-decay learning rate and a linear warmup.
# NOTE: initial and end learning rates are both 1.5e-3, so the
# "polynomial" schedule is effectively a constant LR after warmup.
optimization_config = optimization.OptimizationConfig(
    optimizer=optimization.OptimizerConfig(
        type='lamb',
        lamb=optimization.LAMBConfig(
            weight_decay_rate=0.01,
            # Variables whose names contain these substrings skip weight decay.
            exclude_from_weight_decay=['LayerNorm', 'bias', 'norm'],
            clipnorm=1.0)),
    learning_rate=optimization.LrConfig(
        type='polynomial',
        polynomial=optimization.PolynomialLrConfig(
            initial_learning_rate=1.5e-3,
            decay_steps=10000,
            end_learning_rate=1.5e-3)),
    warmup=optimization.WarmupConfig(
        type='linear',
        # Warmup ramps linearly from 0 up to the schedule's base rate.
        linear=optimization.LinearWarmupConfig(warmup_learning_rate=0)))
|
|
|
|
|
|
|
def config_override(params, flags_obj):
  """Override ExperimentConfig according to flags."""
  # Push the TPU address from the command line into the runtime config.
  params.override({
      'runtime': {
          'tpu': flags_obj.tpu,
      }
  })

  # Layer each YAML config file over the defaults, in the order given.
  for config_file in flags_obj.config_file or []:
    params = hyperparams.override_params_dict(
        params, config_file, is_strict=True)

  # Explicit --params_override values win over config files.
  if flags_obj.params_override:
    params = hyperparams.override_params_dict(
        params, flags_obj.params_override, is_strict=True)

  # Freeze the config so later code cannot mutate it accidentally.
  params.validate()
  params.lock()

  logging.info('Final experiment parameters: %s',
               pprint.pformat(params.as_dict()))

  # Persist the resolved config next to the checkpoints when training.
  model_dir = flags_obj.model_dir
  if 'train' in flags_obj.mode:
    train_utils.serialize_config(params, model_dir)

  return params
|
|
|
|
|
def get_exp_config():
  """Get ExperimentConfig."""
  # Distillation task: BERT pretrain dataloaders for train and eval.
  task_config = distillation.BertDistillationTaskConfig(
      train_data=pretrain_dataloader.BertPretrainDataConfig(),
      validation_data=pretrain_dataloader.BertPretrainDataConfig(
          is_training=False))
  # Progressive trainer driving the stage-wise distillation schedule.
  trainer_config = prog_trainer_lib.ProgressiveTrainerConfig(
      progressive=distillation.BertDistillationProgressiveConfig(),
      optimizer_config=optimization_config,
      train_steps=740000,
      checkpoint_interval=20000)
  params = cfg.ExperimentConfig(task=task_config, trainer=trainer_config)
  # Apply command-line overrides (config files, --params_override, TPU).
  return config_override(params, FLAGS)
|
|
|
|
|
def main(_):
  """Builds the distillation task and launches the progressive trainer."""
  logging.info('Parsing config files...')
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = get_exp_config()

  # Set the mixed-precision policy before any model variables are created.
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  # Variables created by the task must live under the strategy scope.
  with strategy.scope():
    task = distillation.BertDistillationTask(
        strategy=strategy,
        progressive=params.trainer.progressive,
        optimizer_config=params.trainer.optimizer_config,
        task_config=params.task)

  train_lib.run_experiment(
      distribution_strategy=strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=FLAGS.model_dir)
|
|
|
if __name__ == '__main__':
  # Register the standard Model Garden flags, then hand control to absl,
  # which parses argv and invokes main().
  tfm_flags.define_flags()
  app.run(main)
|
|
|