#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: Pengcheng He (penhe@microsoft.com)
# Date: 05/15/2019
#
""" Arguments for optimizer
"""
import argparse
from ..utils import boolean_string
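# `boolean_string` (defined in this repo's utils) is assumed here to convert
# the command-line strings 'true'/'false' (case-insensitive) into Python
# bools, so flags such as --fp16 take an explicit True/False value.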
__all__ = ['get_args']


def get_args():
    parser = argparse.ArgumentParser(add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    group = parser.add_argument_group(title='Optimizer', description='Parameters for the distributed optimizer')
    group.add_argument('--fp16',
                       default=False,
                       type=boolean_string,
                       help="Whether to use 16-bit float precision instead of 32-bit.")
    group.add_argument('--loss_scale',
                       type=float,
                       default=16384,
                       help='Loss scaling factor; positive power-of-2 values can improve fp16 convergence.')
    group.add_argument('--scale_steps',
                       type=int,
                       default=250,
                       help='The number of steps to wait before increasing the loss scale.')
    group.add_argument('--lookahead_k',
                       default=-1,
                       type=int,
                       help="The k (synchronization period) parameter for the Lookahead optimizer.")
    group.add_argument('--lookahead_alpha',
                       default=0.5,
                       type=float,
                       help="The alpha (slow-weights step size) parameter for the Lookahead optimizer.")
    group.add_argument('--with_radam',
                       default=False,
                       type=boolean_string,
                       help="Whether to use RAdam.")
    group.add_argument('--opt_type',
                       type=str.lower,
                       default='adam',
                       choices=['adam', 'admax'],
                       help="The optimizer to be used.")
    group.add_argument("--warmup_proportion",
                       default=0.1,
                       type=float,
                       help="Proportion of training to perform linear learning rate warmup for. "
                            "E.g., 0.1 = 10%% of training.")
    group.add_argument("--lr_schedule_ends",
                       default=0,
                       type=float,
                       help="The final learning rate scale at the end of the learning rate schedule.")
    group.add_argument("--lr_schedule",
                       default='warmup_linear',
                       type=str,
                       help="The learning rate schedule used for training, "
                            "e.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default: warmup_linear.")
    group.add_argument("--max_grad_norm",
                       default=1,
                       type=float,
                       help="The clipping threshold for the global gradient norm.")
    group.add_argument("--learning_rate",
                       default=5e-5,
                       type=float,
                       help="The initial learning rate for Adam.")
    group.add_argument("--epsilon",
                       default=1e-6,
                       type=float,
                       help="The epsilon setting for Adam.")
    group.add_argument("--adam_beta1",
                       default=0.9,
                       type=float,
                       help="The beta1 parameter for Adam.")
    group.add_argument("--adam_beta2",
                       default=0.999,
                       type=float,
                       help="The beta2 parameter for Adam.")
    group.add_argument('--weight_decay',
                       type=float,
                       default=0.01,
                       help="The weight decay rate.")
    return parser
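

# Usage sketch (an illustration, not part of the original module): `get_args()`
# returns a parser built with add_help=False, so it is meant to be composed
# into a program's own parser via argparse's `parents` mechanism. Because of
# the relative import above, this block only runs when the module is executed
# as a module (python -m <package>.<module>) from the package root; the exact
# module path depends on where this file lives in the repo.
if __name__ == '__main__':
    main_parser = argparse.ArgumentParser(parents=[get_args()])
    opts = main_parser.parse_args(['--fp16', 'True', '--opt_type', 'ADMAX'])
    # argparse applies `type` before checking `choices`, so type=str.lower
    # normalizes 'ADMAX' to 'admax' and the value passes validation.
    print(opts.fp16, opts.opt_type)  # -> True admax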