| """ Implementation of all available options """ |
| import configargparse |
|
|
| from onmt.modules.sru import CheckSRU |
| from onmt.transforms import AVAILABLE_TRANSFORMS |
| from onmt.constants import ModelTask |
| from onmt.modules.position_ffn import ACTIVATION_FUNCTIONS |
| from onmt.modules.position_ffn import ActivationFunction |
| from onmt.constants import DefaultTokens |
|
|
|
|
def config_opts(parser):
    """Register the YAML-config read/write options on ``parser``."""
    cfg_group = parser.add_argument_group("Configuration")
    # configargparse reads the whole run configuration from this file.
    cfg_group.add(
        "-config",
        "--config",
        help="Path of the main YAML config file.",
        is_config_file_arg=True,
        required=False,
    )
    # Mirror of -config: dump the resolved options back out to a file.
    cfg_group.add(
        "-save_config",
        "--save_config",
        help="Path where to save the config.",
        is_write_out_config_file_arg=True,
        required=False,
    )
|
|
|
|
def _add_logging_opts(parser, is_train=True):
    """Add logging and reporting options.

    Args:
        parser: configargparse parser to extend.
        is_train: if True, add training-time reporting options;
            otherwise add inference-time debugging options.
    """
    group = parser.add_argument_group("Logging")
    group.add(
        "--log_file",
        "-log_file",
        type=str,
        default="",
        help="Output logs to a file under this path.",
    )
    group.add(
        "--log_file_level",
        "-log_file_level",
        type=str,
        action=StoreLoggingLevelAction,
        choices=StoreLoggingLevelAction.CHOICES,
        default="0",
    )
    group.add(
        "--verbose",
        "-verbose",
        action="store_true",
        # FIX: added the missing space before "(default ..." in the
        # training-mode help string.
        help="Print data loading and statistics for all process "
        "(default only log the first process shard)"
        if is_train
        else "Print scores and predictions for each sentence",
    )

    if is_train:
        group.add(
            "--valid_metrics",
            "-valid_metrics",
            default=[],
            nargs="+",
            help="List of names of additional validation metrics",
        )
        group.add(
            "--scoring_debug",
            "-scoring_debug",
            action="store_true",
            help="Dump the src/ref/pred of the current batch",
        )
        group.add(
            "--dump_preds",
            "-dump_preds",
            type=str,
            default=None,
            help="Folder to dump predictions to.",
        )
        group.add(
            "--report_every",
            "-report_every",
            type=int,
            default=50,
            help="Print stats at this interval.",
        )
        group.add(
            "--exp_host",
            "-exp_host",
            type=str,
            default="",
            help="Send logs to this crayon server.",
        )
        group.add(
            "--exp",
            "-exp",
            type=str,
            default="",
            help="Name of the experiment for logging.",
        )

        group.add(
            "--tensorboard",
            "-tensorboard",
            action="store_true",
            help="Use tensorboard for visualization during training. "
            "Must have the library tensorboard >= 1.14.",
        )
        group.add(
            "--tensorboard_log_dir",
            "-tensorboard_log_dir",
            type=str,
            default="runs/onmt",
            help="Log directory for Tensorboard. " "This is also the name of the run.",
        )
        group.add(
            "--override_opts",
            # FIX: add the underscore-style short flag for consistency with
            # every other option; the historical hyphenated alias is kept
            # for backward compatibility.
            "-override_opts",
            "-override-opts",
            action="store_true",
            help="Allow to override some checkpoint opts",
        )
    else:
        group.add(
            "--attn_debug",
            "-attn_debug",
            action="store_true",
            help="Print best attn for each word",
        )
        group.add(
            "--align_debug",
            "-align_debug",
            action="store_true",
            help="Print best align for each word",
        )
        group.add(
            "--dump_beam",
            "-dump_beam",
            type=str,
            default="",
            help="File to dump beam information to.",
        )
        group.add(
            "--n_best",
            "-n_best",
            type=int,
            default=1,
            help="If verbose is set, will output the n_best " "decoded sentences",
        )
        group.add(
            "--with_score",
            "-with_score",
            action="store_true",
            help="add a tab separated score to the translation",
        )
|
|
|
|
| def _add_reproducibility_opts(parser): |
| group = parser.add_argument_group("Reproducibility") |
| group.add( |
| "--seed", |
| "-seed", |
| type=int, |
| default=-1, |
| help="Set random seed used for better " "reproducibility between experiments.", |
| ) |
|
|
|
|
def _add_dynamic_corpus_opts(parser, build_vocab_only=False):
    """Options related to training corpus, type: a list of dictionary.

    Args:
        parser: configargparse parser to extend.
        build_vocab_only: if True, add the options that only apply when
            building the vocabulary (sample dumping, threading, subwords).
    """
    group = parser.add_argument_group("Data")
    group.add(
        "-data",
        "--data",
        required=True,
        help="List of datasets and their specifications. "
        "See examples/*.yaml for further details.",
    )
    group.add(
        "-skip_empty_level",
        "--skip_empty_level",
        default="warning",
        choices=["silent", "warning", "error"],
        # FIX: repaired grammar, the missing separators between the
        # concatenated literals, and the "encouter" typo.
        help="Security level when encountering empty examples. "
        "silent: silently ignore/skip empty example; "
        "warning: warn when ignoring/skipping empty example; "
        "error: raise error & stop execution when encountering empty.",
    )
    group.add(
        "-transforms",
        "--transforms",
        default=[],
        nargs="+",
        choices=AVAILABLE_TRANSFORMS.keys(),
        help="Default transform pipeline to apply to data. "
        "Can be specified in each corpus of data to override.",
    )

    group.add(
        "-save_data",
        "--save_data",
        # Mandatory only when building the vocab: outputs must go somewhere.
        required=build_vocab_only,
        help="Output base path for objects that will "
        "be saved (vocab, transforms, embeddings, ...).",
    )
    group.add(
        "-overwrite",
        "--overwrite",
        action="store_true",
        help="Overwrite existing objects if any.",
    )
    group.add(
        "-n_sample",
        "--n_sample",
        type=int,
        default=(5000 if build_vocab_only else 0),
        help=("Build vocab using " if build_vocab_only else "Stop after save ")
        + "this number of transformed samples/corpus. Can be [-1, 0, N>0]. "
        "Set to -1 to go full corpus, 0 to skip.",
    )

    if not build_vocab_only:
        group.add(
            "-dump_transforms",
            "--dump_transforms",
            action="store_true",
            help="Dump transforms `*.transforms.pt` to disk."
            " -save_data should be set as saving prefix.",
        )
    else:
        group.add(
            "-dump_samples",
            "--dump_samples",
            action="store_true",
            help="Dump samples when building vocab. "
            "Warning: this may slow down the process.",
        )
        group.add(
            "-num_threads",
            "--num_threads",
            type=int,
            default=1,
            help="Number of parallel threads to build the vocab.",
        )
        group.add(
            "-learn_subwords",
            "--learn_subwords",
            action="store_true",
            help="Learn subwords prior to building vocab",
        )
        group.add(
            "-learn_subwords_size",
            "--learn_subwords_size",
            type=int,
            default=32000,
            help="Learn subwords operations",
        )
        group.add(
            "-vocab_sample_queue_size",
            "--vocab_sample_queue_size",
            type=int,
            default=20,
            help="Size of queues used in the build_vocab dump path.",
        )
|
|
|
|
| def _add_features_opts(parser): |
| group = parser.add_argument_group("Features") |
| group.add( |
| "-n_src_feats", |
| "--n_src_feats", |
| type=int, |
| default=0, |
| help="Number of source feats.", |
| ) |
| group.add( |
| "-src_feats_defaults", |
| "--src_feats_defaults", |
| help="Default features to apply in source in case " "there are not annotated", |
| ) |
|
|
|
|
def _add_dynamic_vocab_opts(parser, build_vocab_only=False):
    """Options related to vocabulary and features.

    Add all options relate to vocabulary or features to parser.
    """
    grp = parser.add_argument_group("Vocab")
    # In build-vocab mode the vocab paths are outputs, otherwise inputs.
    path_hint = "Path to save" if build_vocab_only else "Path to"
    grp.add(
        "-src_vocab",
        "--src_vocab",
        required=True,
        help=path_hint + " src (or shared) vocabulary file. "
        "Format: one <word> or <word>\t<count> per line.",
    )
    grp.add(
        "-tgt_vocab",
        "--tgt_vocab",
        help=path_hint + " tgt vocabulary file. "
        "Format: one <word> or <word>\t<count> per line.",
    )
    grp.add(
        "-share_vocab",
        "--share_vocab",
        help="Share source and target vocabulary.",
        action="store_true",
    )
    grp.add(
        "--decoder_start_token",
        "-decoder_start_token",
        type=str,
        default=DefaultTokens.BOS,
        help=(
            "Default decoder start token "
            "for most ONMT models it is <s> = BOS "
            "it happens that for some Fairseq model it requires </s> "
        ),
    )
    grp.add(
        "--default_specials",
        "-default_specials",
        nargs="+",
        type=str,
        default=[
            DefaultTokens.UNK,
            DefaultTokens.PAD,
            DefaultTokens.BOS,
            DefaultTokens.EOS,
        ],
        help=(
            "default specials used for Vocab initialization"
            " UNK, PAD, BOS, EOS will take IDs 0, 1, 2, 3 "
            " typically <unk> <blank> <s> </s> "
        ),
    )

    _add_features_opts(parser)

    # Size limits only make sense once a vocab already exists.
    if not build_vocab_only:
        grp.add(
            "-src_vocab_size",
            "--src_vocab_size",
            help="Maximum size of the source vocabulary.",
            type=int,
            default=32768,
        )
        grp.add(
            "-tgt_vocab_size",
            "--tgt_vocab_size",
            help="Maximum size of the target vocabulary",
            type=int,
            default=32768,
        )
        grp.add(
            "-vocab_size_multiple",
            "--vocab_size_multiple",
            help="Make the vocabulary size a multiple of this value.",
            type=int,
            default=8,
        )

    grp.add(
        "-src_words_min_frequency",
        "--src_words_min_frequency",
        help="Discard source words with lower frequency.",
        type=int,
        default=0,
    )
    grp.add(
        "-tgt_words_min_frequency",
        "--tgt_words_min_frequency",
        help="Discard target words with lower frequency.",
        type=int,
        default=0,
    )

    # Sequence-length truncation.
    grp = parser.add_argument_group("Pruning")
    grp.add(
        "--src_seq_length_trunc",
        "-src_seq_length_trunc",
        help="Truncate source sequence length.",
        type=int,
        default=None,
    )
    grp.add(
        "--tgt_seq_length_trunc",
        "-tgt_seq_length_trunc",
        help="Truncate target sequence length.",
        type=int,
        default=None,
    )

    # Pretrained-embedding files.
    grp = parser.add_argument_group("Embeddings")
    grp.add(
        "-both_embeddings",
        "--both_embeddings",
        help="Path to the embeddings file to use "
        "for both source and target tokens.",
    )
    grp.add(
        "-src_embeddings",
        "--src_embeddings",
        help="Path to the embeddings file to use for source tokens.",
    )
    grp.add(
        "-tgt_embeddings",
        "--tgt_embeddings",
        help="Path to the embeddings file to use for target tokens.",
    )
    grp.add(
        "-embeddings_type",
        "--embeddings_type",
        choices=["GloVe", "word2vec"],
        help="Type of embeddings file.",
    )
|
|
|
|
def _add_dynamic_transform_opts(parser):
    """Options related to transforms.

    Options that specified in the definitions of each transform class
    at `onmt/transforms/*.py`.
    """
    # Iterate values only: the registry key is unused here, each transform
    # class registers its own options on the parser.
    for transform_cls in AVAILABLE_TRANSFORMS.values():
        transform_cls.add_options(parser)
|
|
|
|
def dynamic_prepare_opts(parser, build_vocab_only=False):
    """Options related to data prepare in dynamic mode.

    Add all dynamic data prepare related options to parser.
    If `build_vocab_only` set to True, then only contains options that
    will be used in `onmt/bin/build_vocab.py`.
    """
    # Shared option sets, in the order they should appear in --help.
    config_opts(parser)
    _add_dynamic_corpus_opts(parser, build_vocab_only=build_vocab_only)
    _add_dynamic_vocab_opts(parser, build_vocab_only=build_vocab_only)
    _add_dynamic_transform_opts(parser)

    # Seeding only matters for the sampling done while building the vocab.
    if build_vocab_only:
        _add_reproducibility_opts(parser)
| |
|
|
|
|
def distributed_opts(parser):
    """Register options controlling multi-GPU / multi-process training."""
    dist_group = parser.add_argument_group("Distributed")
    dist_group.add(
        "--gpu_ranks",
        "-gpu_ranks",
        type=int,
        nargs="*",
        default=[],
        help="list of ranks of each process.",
    )
    dist_group.add(
        "--world_size",
        "-world_size",
        type=int,
        default=1,
        help="total number of distributed processes.",
    )
    dist_group.add(
        "--parallel_mode",
        "-parallel_mode",
        type=str,
        choices=["tensor_parallel", "data_parallel"],
        default="data_parallel",
        help="Distributed mode.",
    )
    # Backend passed straight to torch.distributed.init_process_group.
    dist_group.add(
        "--gpu_backend",
        "-gpu_backend",
        type=str,
        default="nccl",
        help="Type of torch distributed backend",
    )
    dist_group.add(
        "--gpu_verbose_level",
        "-gpu_verbose_level",
        type=int,
        default=0,
        help="Gives more info on each process per GPU.",
    )
    # Rendezvous endpoint for torch.distributed.
    dist_group.add(
        "--master_ip",
        "-master_ip",
        type=str,
        default="localhost",
        help="IP of master for torch.distributed training.",
    )
    dist_group.add(
        "--master_port",
        "-master_port",
        type=int,
        default=10000,
        help="Port of master for torch.distributed training.",
    )
|
|
|
|
def model_opts(parser):
    """
    These options are passed to the construction of the model.
    Be careful with these as they will be used during translation.
    """

    # Embedding Options
    group = parser.add_argument_group("Model-Embeddings")
    group.add(
        "--src_word_vec_size",
        "-src_word_vec_size",
        type=int,
        default=500,
        help="Word embedding size for src.",
    )
    group.add(
        "--tgt_word_vec_size",
        "-tgt_word_vec_size",
        type=int,
        default=500,
        help="Word embedding size for tgt.",
    )
    group.add(
        "--word_vec_size",
        "-word_vec_size",
        type=int,
        default=-1,
        help="Word embedding size for src and tgt.",
    )

    group.add(
        "--share_decoder_embeddings",
        "-share_decoder_embeddings",
        action="store_true",
        help="Use a shared weight matrix for the input and "
        "output word embeddings in the decoder.",
    )
    group.add(
        "--share_embeddings",
        "-share_embeddings",
        action="store_true",
        help="Share the word embeddings between encoder "
        "and decoder. Need to use shared dictionary for this "
        "option.",
    )
    group.add(
        "--position_encoding",
        "-position_encoding",
        action="store_true",
        help="Use a sin to mark relative words positions. "
        "Necessary for non-RNN style models.",
    )
    group.add(
        "--position_encoding_type",
        "-position_encoding_type",
        type=str,
        default="SinusoidalInterleaved",
        choices=["SinusoidalInterleaved", "SinusoidalConcat"],
        help="Type of positional encoding. At the moment: "
        "Sinusoidal fixed, Interleaved or Concat",
    )

    group.add(
        "-update_vocab",
        "--update_vocab",
        action="store_true",
        help="Update source and target existing vocabularies",
    )

    group = parser.add_argument_group("Model-Embedding Features")
    group.add(
        "--feat_merge",
        "-feat_merge",
        type=str,
        default="concat",
        choices=["concat", "sum", "mlp"],
        help="Merge action for incorporating features embeddings. "
        "Options [concat|sum|mlp].",
    )
    group.add(
        "--feat_vec_size",
        "-feat_vec_size",
        type=int,
        default=-1,
        help="If specified, feature embedding sizes "
        "will be set to this. Otherwise, feat_vec_exponent "
        "will be used.",
    )
    group.add(
        "--feat_vec_exponent",
        "-feat_vec_exponent",
        type=float,
        default=0.7,
        # FIX: the help referenced a non-existent "-feat_merge_size" option;
        # the actual fixed-size option is -feat_vec_size.
        help="If -feat_vec_size is not set, feature "
        "embedding sizes will be set to N^feat_vec_exponent "
        "where N is the number of values the feature takes.",
    )

    # Model task (seq2seq vs language model)
    group = parser.add_argument_group("Model- Task")
    group.add(
        "-model_task",
        "--model_task",
        default=ModelTask.SEQ2SEQ,
        choices=[ModelTask.SEQ2SEQ, ModelTask.LANGUAGE_MODEL],
        help="Type of task for the model either seq2seq or lm",
    )

    # Encoder-Decoder Options
    group = parser.add_argument_group("Model- Encoder-Decoder")
    group.add(
        "--model_type",
        "-model_type",
        default="text",
        choices=["text"],
        help="Type of source model to use. Allows "
        "the system to incorporate non-text inputs. "
        "Options are [text].",
    )
    group.add(
        "--model_dtype",
        "-model_dtype",
        default="fp32",
        choices=["fp32", "fp16"],
        help="Data type of the model.",
    )

    group.add(
        "--encoder_type",
        "-encoder_type",
        type=str,
        default="rnn",
        help="Type of encoder layer to use. Non-RNN layers "
        "are experimental. Default options are "
        "[rnn|brnn|ggnn|mean|transformer|cnn|transformer_lm].",
    )
    group.add(
        "--decoder_type",
        "-decoder_type",
        type=str,
        default="rnn",
        # FIX: the list wrongly repeated "transformer"; the second entry is
        # the LM decoder, matching the encoder help above.
        help="Type of decoder layer to use. Non-RNN layers "
        "are experimental. Default options are "
        "[rnn|transformer|cnn|transformer_lm].",
    )

    group.add(
        "--freeze_encoder",
        "-freeze_encoder",
        action="store_true",
        help="Freeze parameters in encoder.",
    )
    group.add(
        "--freeze_decoder",
        "-freeze_decoder",
        action="store_true",
        help="Freeze parameters in decoder.",
    )

    group.add(
        "--layers", "-layers", type=int, default=-1, help="Number of layers in enc/dec."
    )
    group.add(
        "--enc_layers",
        "-enc_layers",
        type=int,
        default=2,
        help="Number of layers in the encoder",
    )
    group.add(
        "--dec_layers",
        "-dec_layers",
        type=int,
        default=2,
        help="Number of layers in the decoder",
    )
    group.add(
        "--hidden_size",
        "-hidden_size",
        type=int,
        default=-1,
        help="Size of rnn hidden states. Overwrites " "enc_hid_size and dec_hid_size",
    )
    group.add(
        "--enc_hid_size",
        "-enc_hid_size",
        type=int,
        default=500,
        help="Size of encoder rnn hidden states.",
    )
    group.add(
        "--dec_hid_size",
        "-dec_hid_size",
        type=int,
        default=500,
        help="Size of decoder rnn hidden states.",
    )
    group.add(
        "--cnn_kernel_width",
        "-cnn_kernel_width",
        type=int,
        default=3,
        help="Size of windows in the cnn, the kernel_size is "
        "(cnn_kernel_width, 1) in conv layer",
    )

    group.add(
        "--layer_norm",
        "-layer_norm",
        type=str,
        default="standard",
        choices=["standard", "rms"],
        help="The type of layer"
        " normalization in the transformer architecture. Choices are"
        " standard or rms. Default to standard",
    )
    group.add(
        "--norm_eps", "-norm_eps", type=float, default=1e-6, help="Layer norm epsilon"
    )

    group.add(
        "--pos_ffn_activation_fn",
        "-pos_ffn_activation_fn",
        type=str,
        default=ActivationFunction.relu,
        choices=ACTIVATION_FUNCTIONS.keys(),
        help="The activation"
        " function to use in PositionwiseFeedForward layer. Choices are"
        f" {ACTIVATION_FUNCTIONS.keys()}. Default to"
        f" {ActivationFunction.relu}.",
    )

    group.add(
        "--input_feed",
        "-input_feed",
        type=int,
        default=1,
        help="Feed the context vector at each time step as "
        "additional input (via concatenation with the word "
        "embeddings) to the decoder.",
    )
    group.add(
        "--bridge",
        "-bridge",
        action="store_true",
        help="Have an additional layer between the last encoder "
        "state and the first decoder state",
    )
    group.add(
        "--rnn_type",
        "-rnn_type",
        type=str,
        default="LSTM",
        choices=["LSTM", "GRU", "SRU"],
        # CheckSRU validates that the SRU dependency is available.
        action=CheckSRU,
        help="The gate type to use in the RNNs",
    )
    group.add(
        "--context_gate",
        "-context_gate",
        type=str,
        default=None,
        choices=["source", "target", "both"],
        help="Type of context gate to use. " "Do not select for no context gate.",
    )

    # GGNN (graph encoder) options
    group.add(
        "--bridge_extra_node",
        "-bridge_extra_node",
        type=bool,
        default=True,
        help="Graph encoder bridges only extra node to decoder as input",
    )
    group.add(
        "--bidir_edges",
        "-bidir_edges",
        type=bool,
        default=True,
        help="Graph encoder autogenerates bidirectional edges",
    )
    group.add(
        "--state_dim",
        "-state_dim",
        type=int,
        default=512,
        help="Number of state dimensions in the graph encoder",
    )
    group.add(
        "--n_edge_types",
        "-n_edge_types",
        type=int,
        default=2,
        help="Number of edge types in the graph encoder",
    )
    group.add(
        "--n_node",
        "-n_node",
        type=int,
        default=2,
        help="Number of nodes in the graph encoder",
    )
    group.add(
        "--n_steps",
        "-n_steps",
        type=int,
        default=2,
        help="Number of steps to advance graph encoder",
    )
    group.add(
        "--src_ggnn_size",
        "-src_ggnn_size",
        type=int,
        default=0,
        help="Vocab size plus feature space for embedding input",
    )

    # Attention options
    group = parser.add_argument_group("Model- Attention")
    group.add(
        "--global_attention",
        "-global_attention",
        type=str,
        default="general",
        choices=["dot", "general", "mlp", "none"],
        help="The attention type to use: "
        "dotprod or general (Luong) or MLP (Bahdanau)",
    )
    group.add(
        "--global_attention_function",
        "-global_attention_function",
        type=str,
        default="softmax",
        choices=["softmax", "sparsemax"],
    )
    group.add(
        "--self_attn_type",
        "-self_attn_type",
        type=str,
        default="scaled-dot",
        help="Self attention type in Transformer decoder "
        'layer -- currently "scaled-dot" or "average" ',
    )
    group.add(
        "--max_relative_positions",
        "-max_relative_positions",
        type=int,
        default=0,
        # FIX: added the missing separators between the concatenated
        # help literals (the words used to run together).
        help="This setting enable relative position encoding. "
        "We support two types of encodings: "
        "set this to -1 to enable Rotary Embeddings, "
        "more info: https://arxiv.org/abs/2104.09864 ; "
        "set this to > 0 (ex: 16, 32) to use "
        "Maximum distance between inputs in relative "
        "positions representations. "
        "more info: https://arxiv.org/pdf/1803.02155.pdf",
    )
    group.add(
        "--relative_positions_buckets",
        "-relative_positions_buckets",
        type=int,
        default=0,
        help="This setting enable relative position bias "
        "more info: https://github.com/google-research/text-to-text-transfer-transformer",
    )
    group.add(
        "--heads",
        "-heads",
        type=int,
        default=8,
        help="Number of heads for transformer self-attention",
    )
    group.add(
        "--transformer_ff",
        "-transformer_ff",
        type=int,
        default=2048,
        help="Size of hidden transformer feed-forward",
    )
    group.add(
        "--aan_useffn",
        "-aan_useffn",
        action="store_true",
        help="Turn on the FFN layer in the AAN decoder",
    )
    group.add(
        "--add_qkvbias",
        "-add_qkvbias",
        action="store_true",
        help="Add bias to nn.linear of Query/Key/Value in MHA. "
        "Note: this will add bias to output proj layer too",
    )
    group.add(
        "--multiquery",
        "-multiquery",
        action="store_true",
        help="Use MultiQuery attention. " "Note: https://arxiv.org/pdf/1911.02150.pdf",
    )
    group.add(
        "--num_kv",
        "-num_kv",
        type=int,
        default=0,
        help="Number of heads for KV in the variant of MultiQuery attention (egs: Falcon 40B)",
    )
    group.add(
        "--add_ffnbias",
        "-add_ffnbias",
        action="store_true",
        help="Add bias to nn.linear of Position_wise FFN",
    )
    group.add(
        "--parallel_residual",
        "-parallel_residual",
        action="store_true",
        help="Use Parallel residual in Decoder Layer. "
        "Note: this is used by GPT-J / Falcon Architecture",
    )
    group.add(
        "--shared_layer_norm",
        "-shared_layer_norm",
        action="store_true",
        help="Use a shared layer_norm in parallel residual attention. "
        "Note: must be true for Falcon 7B / false for Falcon 40B",
    )

    # Alignment options (Garg et al. 2019 supervised alignment)
    group = parser.add_argument_group("Model - Alignement")
    group.add(
        "--lambda_align",
        "-lambda_align",
        type=float,
        default=0.0,
        # FIX: "alignement" typo and missing separator in the help text.
        help="Lambda value for alignment loss of Garg et al (2019). "
        "For more detailed information, see: "
        "https://arxiv.org/abs/1909.02074",
    )
    group.add(
        "--alignment_layer",
        "-alignment_layer",
        type=int,
        default=-3,
        help="Layer number which has to be supervised.",
    )
    group.add(
        "--alignment_heads",
        "-alignment_heads",
        type=int,
        default=0,
        help="N. of cross attention heads per layer to supervised with",
    )
    group.add(
        "--full_context_alignment",
        "-full_context_alignment",
        action="store_true",
        help="Whether alignment is conditioned on full target context.",
    )

    # Generator and Loss options.
    group = parser.add_argument_group("Generator")
    group.add(
        "--copy_attn",
        "-copy_attn",
        action="store_true",
        help="Train copy attention layer.",
    )
    group.add(
        "--copy_attn_type",
        "-copy_attn_type",
        type=str,
        default=None,
        choices=["dot", "general", "mlp", "none"],
        help="The copy attention type to use. Leave as None to use "
        "the same as -global_attention.",
    )
    group.add(
        "--generator_function",
        "-generator_function",
        default="softmax",
        choices=["softmax", "sparsemax"],
        help="Which function to use for generating "
        "probabilities over the target vocabulary (choices: "
        "softmax, sparsemax)",
    )
    group.add(
        "--copy_attn_force",
        "-copy_attn_force",
        action="store_true",
        help="When available, train to copy.",
    )
    group.add(
        "--reuse_copy_attn",
        "-reuse_copy_attn",
        action="store_true",
        help="Reuse standard attention for copy",
    )
    group.add(
        "--copy_loss_by_seqlength",
        "-copy_loss_by_seqlength",
        action="store_true",
        help="Divide copy loss by length of sequence",
    )
    group.add(
        "--coverage_attn",
        "-coverage_attn",
        action="store_true",
        help="Train a coverage attention layer.",
    )
    group.add(
        "--lambda_coverage",
        "-lambda_coverage",
        type=float,
        default=0.0,
        help="Lambda value for coverage loss of See et al (2017)",
    )
    group.add(
        "--lm_prior_model",
        "-lm_prior_model",
        type=str,
        default=None,
        help="LM model to used to train the TM",
    )
    group.add(
        "--lm_prior_lambda",
        # FIX: the short flag was misspelled "-lambda_prior_lambda"; add the
        # consistent "-lm_prior_lambda" while keeping the old alias for
        # backward compatibility.
        "-lm_prior_lambda",
        "-lambda_prior_lambda",
        type=float,
        default=0.0,
        help="LM Prior Lambda",
    )
    group.add(
        "--lm_prior_tau",
        # FIX: same misspelling as above ("-lambda_prior_tau").
        "-lm_prior_tau",
        "-lambda_prior_tau",
        type=float,
        default=1.0,
        help="LM Prior Tau",
    )
    group.add(
        "--loss_scale",
        "-loss_scale",
        type=float,
        default=0,
        help="For FP16 training, the static loss scale to use. If not "
        "set, the loss scale is dynamically computed.",
    )
    group.add(
        "--apex_opt_level",
        "-apex_opt_level",
        type=str,
        default="",
        choices=["", "O0", "O1", "O2", "O3"],
        help="For FP16 training, the opt_level to use. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels.",
    )
    group.add(
        "--zero_out_prompt_loss",
        "-zero_out_prompt_loss",
        action="store_true",
        help="Set the prompt loss to zero. "
        "Mostly for LLM finetuning. "
        "Will be enabled only if the `insert_mask_before_placeholder` transform is applied",
    )
    group.add(
        "--use_ckpting",
        "-use_ckpting",
        default=[],
        nargs="+",
        choices=["ffn", "mha", "lora"],
        type=str,
        help="use gradient checkpointing those modules",
    )
|
|
|
|
| def _add_train_general_opts(parser): |
| """General options for training""" |
| group = parser.add_argument_group("General") |
| group.add( |
| "--data_type", |
| "-data_type", |
| default="text", |
| help="Type of the source input. " "Options are [text].", |
| ) |
|
|
| group.add( |
| "--save_model", |
| "-save_model", |
| default="model", |
| help="Model filename (the model will be saved as " |
| "<save_model>_N.pt where N is the number " |
| "of steps", |
| ) |
|
|
| group.add( |
| "--save_format", |
| "-save_format", |
| default="pytorch", |
| choices=["pytorch", "safetensors"], |
| help="Format to save the model weights", |
| ) |
|
|
| group.add( |
| "--save_checkpoint_steps", |
| "-save_checkpoint_steps", |
| type=int, |
| default=5000, |
| help="""Save a checkpoint every X steps""", |
| ) |
| group.add( |
| "--keep_checkpoint", |
| "-keep_checkpoint", |
| type=int, |
| default=-1, |
| help="Keep X checkpoints (negative: keep all)", |
| ) |
|
|
| |
| group.add( |
| "--lora_layers", |
| "-lora_layers", |
| default=[], |
| nargs="+", |
| type=str, |
| help="list of layers to be replaced by LoRa layers." |
| " ex: ['linear_values', 'linear_query'] " |
| " cf paper §4.2 https://arxiv.org/abs/2106.09685", |
| ) |
| group.add( |
| "--lora_embedding", |
| "-lora_embedding", |
| action="store_true", |
| help="replace embeddings with LoRa Embeddings see §5.1", |
| ) |
| group.add( |
| "--lora_rank", |
| "-lora_rank", |
| type=int, |
| default=2, |
| help="r=2 successfully tested with NLLB-200 3.3B", |
| ) |
| group.add( |
| "--lora_alpha", |
| "-lora_alpha", |
| type=int, |
| default=1, |
| help="§4.1 https://arxiv.org/abs/2106.09685", |
| ) |
| group.add( |
| "--lora_dropout", |
| "-lora_dropout", |
| type=float, |
| default=0.0, |
| help="rule of thumb: same value as in main model", |
| ) |
|
|
| _add_reproducibility_opts(parser) |
|
|
| |
| group = parser.add_argument_group("Initialization") |
| group.add( |
| "--param_init", |
| "-param_init", |
| type=float, |
| default=0.1, |
| help="Parameters are initialized over uniform distribution " |
| "with support (-param_init, param_init). " |
| "Use 0 to not use initialization", |
| ) |
| group.add( |
| "--param_init_glorot", |
| "-param_init_glorot", |
| action="store_true", |
| help="Init parameters with xavier_uniform. " "Required for transformer.", |
| ) |
|
|
| group.add( |
| "--train_from", |
| "-train_from", |
| default="", |
| type=str, |
| help="If training from a checkpoint then this is the " |
| "path to the pretrained model's state_dict.", |
| ) |
| group.add( |
| "--reset_optim", |
| "-reset_optim", |
| default="none", |
| choices=["none", "all", "states", "keep_states"], |
| help="Optimization resetter when train_from.", |
| ) |
|
|
| |
| group.add( |
| "--pre_word_vecs_enc", |
| "-pre_word_vecs_enc", |
| help="If a valid path is specified, then this will load " |
| "pretrained word embeddings on the encoder side. " |
| "See README for specific formatting instructions.", |
| ) |
| group.add( |
| "--pre_word_vecs_dec", |
| "-pre_word_vecs_dec", |
| help="If a valid path is specified, then this will load " |
| "pretrained word embeddings on the decoder side. " |
| "See README for specific formatting instructions.", |
| ) |
| |
| group.add( |
| "--freeze_word_vecs_enc", |
| "-freeze_word_vecs_enc", |
| action="store_true", |
| help="Freeze word embeddings on the encoder side.", |
| ) |
| group.add( |
| "--freeze_word_vecs_dec", |
| "-freeze_word_vecs_dec", |
| action="store_true", |
| help="Freeze word embeddings on the decoder side.", |
| ) |
|
|
| |
| group = parser.add_argument_group("Optimization- Type") |
| group.add( |
| "--num_workers", |
| "-num_workers", |
| type=int, |
| default=2, |
| help="pytorch DataLoader num_workers", |
| ) |
| group.add( |
| "--batch_size", |
| "-batch_size", |
| type=int, |
| default=64, |
| help="Maximum batch size for training", |
| ) |
| group.add( |
| "--batch_size_multiple", |
| "-batch_size_multiple", |
| type=int, |
| default=1, |
| help="Batch size multiple for token batches.", |
| ) |
| group.add( |
| "--batch_type", |
| "-batch_type", |
| default="sents", |
| choices=["sents", "tokens"], |
| help="Batch grouping for batch_size. Standard " |
| "is sents. Tokens will do dynamic batching", |
| ) |
| group.add( |
| "--normalization", |
| "-normalization", |
| default="sents", |
| choices=["sents", "tokens"], |
| help="Normalization method of the gradient.", |
| ) |
| group.add( |
| "--accum_count", |
| "-accum_count", |
| type=int, |
| nargs="+", |
| default=[1], |
| help="Accumulate gradient this many times. " |
| "Approximately equivalent to updating " |
| "batch_size * accum_count batches at once. " |
| "Recommended for Transformer.", |
| ) |
| group.add( |
| "--accum_steps", |
| "-accum_steps", |
| type=int, |
| nargs="+", |
| default=[0], |
| help="Steps at which accum_count values change", |
| ) |
| group.add( |
| "--valid_steps", |
| "-valid_steps", |
| type=int, |
| default=10000, |
| help="Perfom validation every X steps", |
| ) |
| group.add( |
| "--valid_batch_size", |
| "-valid_batch_size", |
| type=int, |
| default=32, |
| help="Maximum batch size for validation", |
| ) |
| group.add( |
| "--train_steps", |
| "-train_steps", |
| type=int, |
| default=100000, |
| help="Number of training steps", |
| ) |
| group.add( |
| "--single_pass", |
| "-single_pass", |
| action="store_true", |
| help="Make a single pass over the training dataset.", |
| ) |
| group.add( |
| "--early_stopping", |
| "-early_stopping", |
| type=int, |
| default=0, |
| help="Number of validation steps without improving.", |
| ) |
| group.add( |
| "--early_stopping_criteria", |
| "-early_stopping_criteria", |
| nargs="*", |
| default=None, |
| help="Criteria to use for early stopping.", |
| ) |
| group.add( |
| "--optim", |
| "-optim", |
| default="sgd", |
| choices=[ |
| "sgd", |
| "adagrad", |
| "adadelta", |
| "adam", |
| "sparseadam", |
| "adafactor", |
| "fusedadam", |
| "adamw8bit", |
| "pagedadamw8bit", |
| "pagedadamw32bit", |
| ], |
| help="Optimization method.", |
| ) |
| group.add( |
| "--adagrad_accumulator_init", |
| "-adagrad_accumulator_init", |
| type=float, |
| default=0, |
| help="Initializes the accumulator values in adagrad. " |
| "Mirrors the initial_accumulator_value option " |
| "in the tensorflow adagrad (use 0.1 for their default).", |
| ) |
| group.add( |
| "--max_grad_norm", |
| "-max_grad_norm", |
| type=float, |
| default=5, |
| help="If the norm of the gradient vector exceeds this, " |
| "renormalize it to have the norm equal to " |
| "max_grad_norm", |
| ) |
| group.add( |
| "--dropout", |
| "-dropout", |
| type=float, |
| default=[0.3], |
| nargs="+", |
| help="Dropout probability; applied in LSTM stacks.", |
| ) |
| group.add( |
| "--attention_dropout", |
| "-attention_dropout", |
| type=float, |
| default=[0.1], |
| nargs="+", |
| help="Attention Dropout probability.", |
| ) |
| group.add( |
| "--dropout_steps", |
| "-dropout_steps", |
| type=int, |
| nargs="+", |
| default=[0], |
| help="Steps at which dropout changes.", |
| ) |
| group.add( |
| "--truncated_decoder", |
| "-truncated_decoder", |
| type=int, |
| default=0, |
| help="""Truncated bptt.""", |
| ) |
| group.add( |
| "--adam_beta1", |
| "-adam_beta1", |
| type=float, |
| default=0.9, |
| help="The beta1 parameter used by Adam. " |
| "Almost without exception a value of 0.9 is used in " |
| "the literature, seemingly giving good results, " |
| "so we would discourage changing this value from " |
| "the default without due consideration.", |
| ) |
| group.add( |
| "--adam_beta2", |
| "-adam_beta2", |
| type=float, |
| default=0.999, |
| help="The beta2 parameter used by Adam. " |
| "Typically a value of 0.999 is recommended, as this is " |
| "the value suggested by the original paper describing " |
| "Adam, and is also the value adopted in other frameworks " |
| "such as Tensorflow and Keras, i.e. see: " |
| "https://www.tensorflow.org/api_docs/python/tf/train/Adam" |
| "Optimizer or https://keras.io/optimizers/ . " |
| 'Whereas recently the paper "Attention is All You Need" ' |
| "suggested a value of 0.98 for beta2, this parameter may " |
| "not work well for normal models / default " |
| "baselines.", |
| ) |
| group.add( |
| "--label_smoothing", |
| "-label_smoothing", |
| type=float, |
| default=0.0, |
| help="Label smoothing value epsilon. " |
| "Probabilities of all non-true labels " |
| "will be smoothed by epsilon / (vocab_size - 1). " |
| "Set to zero to turn off label smoothing. " |
| "For more detailed information, see: " |
| "https://arxiv.org/abs/1512.00567", |
| ) |
| group.add( |
| "--average_decay", |
| "-average_decay", |
| type=float, |
| default=0, |
| help="Moving average decay. " |
| "Set to other than 0 (e.g. 1e-4) to activate. " |
| "Similar to Marian NMT implementation: " |
| "http://www.aclweb.org/anthology/P18-4020 " |
| "For more detail on Exponential Moving Average: " |
| "https://en.wikipedia.org/wiki/Moving_average", |
| ) |
| group.add( |
| "--average_every", |
| "-average_every", |
| type=int, |
| default=1, |
| help="Step for moving average. " |
| "Default is every update, " |
| "if -average_decay is set.", |
| ) |
|
|
| |
| group = parser.add_argument_group("Optimization- Rate") |
| group.add( |
| "--learning_rate", |
| "-learning_rate", |
| type=float, |
| default=1.0, |
| help="Starting learning rate. " |
| "Recommended settings: sgd = 1, adagrad = 0.1, " |
| "adadelta = 1, adam = 0.001", |
| ) |
| group.add( |
| "--learning_rate_decay", |
| "-learning_rate_decay", |
| type=float, |
| default=0.5, |
| help="If update_learning_rate, decay learning rate by " |
| "this much if steps have gone past " |
| "start_decay_steps", |
| ) |
| group.add( |
| "--start_decay_steps", |
| "-start_decay_steps", |
| type=int, |
| default=50000, |
| help="Start decaying every decay_steps after " "start_decay_steps", |
| ) |
| group.add( |
| "--decay_steps", |
| "-decay_steps", |
| type=int, |
| default=10000, |
| help="Decay every decay_steps", |
| ) |
|
|
| group.add( |
| "--decay_method", |
| "-decay_method", |
| type=str, |
| default="none", |
| choices=["noam", "noamwd", "rsqrt", "none"], |
| help="Use a custom decay rate.", |
| ) |
| group.add( |
| "--warmup_steps", |
| "-warmup_steps", |
| type=int, |
| default=4000, |
| help="Number of warmup steps for custom decay.", |
| ) |
| _add_logging_opts(parser, is_train=True) |
|
|
|
|
| def _add_train_dynamic_data(parser): |
| group = parser.add_argument_group("Dynamic data") |
| group.add( |
| "-bucket_size", |
| "--bucket_size", |
| type=int, |
| default=262144, |
| help="""A bucket is a buffer of bucket_size examples to pick |
| from the various Corpora. The dynamic iterator batches |
| batch_size batchs from the bucket and shuffle them.""", |
| ) |
| group.add( |
| "-bucket_size_init", |
| "--bucket_size_init", |
| type=int, |
| default=-1, |
| help="""The bucket is initalized with this awith this |
| amount of examples (optional)""", |
| ) |
| group.add( |
| "-bucket_size_increment", |
| "--bucket_size_increment", |
| type=int, |
| default=0, |
| help="""The bucket size is incremented with this |
| amount of examples (optional)""", |
| ) |
| group.add( |
| "-prefetch_factor", |
| "--prefetch_factor", |
| type=int, |
| default=200, |
| help="""number of mini-batches loaded in advance to avoid the |
| GPU waiting during the refilling of the bucket.""", |
| ) |
|
|
|
|
| def _add_quant_opts(parser): |
| group = parser.add_argument_group("Quant options") |
| group.add( |
| "--quant_layers", |
| "-quant_layers", |
| default=[], |
| nargs="+", |
| type=str, |
| help="list of layers to be compressed in 4/8bit.", |
| ) |
|
|
| group.add( |
| "--quant_type", |
| "-quant_type", |
| default="bnb_8bit", |
| choices=["bnb_8bit", "bnb_FP4", "bnb_NF4"], |
| type=str, |
| help="Type of compression.", |
| ) |
|
|
|
|
def train_opts(parser):
    """All options used in train.

    Aggregates every option group needed for training by delegating to the
    dedicated registration helpers. The call order determines the order of
    the argument groups in ``--help`` output.
    """
    # Data preparation / vocab options (full pipeline, not vocab-only mode).
    dynamic_prepare_opts(parser, build_vocab_only=False)
    # Multi-GPU / multi-node options.
    distributed_opts(parser)
    # Model architecture options.
    model_opts(parser)
    # Optimization, logging and general training options.
    _add_train_general_opts(parser)
    # Dynamic data loading (bucket/prefetch) options.
    _add_train_dynamic_data(parser)
    # Weight quantization options.
    _add_quant_opts(parser)
|
|
|
|
def _add_decoding_opts(parser):
    """Register decoding-time options: beam search, random sampling,
    penalties, and length/content constraints.

    Args:
        parser: a ``configargparse.ArgumentParser`` (or compatible) to extend.
    """
    group = parser.add_argument_group("Beam Search")
    # Keep a handle on the action so it can also be surfaced in the
    # "Random Sampling" group's help below.
    beam_size = group.add(
        "--beam_size", "-beam_size", type=int, default=5, help="Beam size"
    )
    group.add(
        "--ratio",
        "-ratio",
        type=float,
        # -0.0 acts as the "disabled" sentinel for the ratio stop condition.
        default=-0.0,
        help="Ratio based beam stop condition",
    )

    group = parser.add_argument_group("Random Sampling")
    group.add(
        "--random_sampling_topk",
        "-random_sampling_topk",
        default=0,
        type=int,
        help="Set this to -1 to do random sampling from full "
        "distribution. Set this to value k>1 to do random "
        "sampling restricted to the k most likely next tokens. "
        "Set this to 1 to use argmax.",
    )
    group.add(
        "--random_sampling_topp",
        "-random_sampling_topp",
        default=0.0,
        type=float,
        help="Probability for top-p/nucleus sampling. Restrict tokens"
        " to the most likely until the cumulated probability is"
        " over p. In range [0, 1]."
        " https://arxiv.org/abs/1904.09751",
    )
    group.add(
        "--random_sampling_temp",
        "-random_sampling_temp",
        default=1.0,
        type=float,
        help="If doing random sampling, divide the logits by "
        "this before computing softmax during decoding.",
    )
    # beam_size is relevant to sampling too; append the action so it shows
    # up in this group's --help section as well (private-API shortcut).
    group._group_actions.append(beam_size)
    _add_reproducibility_opts(parser)

    group = parser.add_argument_group(
        "Penalties", ".. Note:: Coverage Penalty is not available in sampling."
    )
    group.add(
        "--length_penalty",
        "-length_penalty",
        default="avg",
        choices=["none", "wu", "avg"],
        help="Length Penalty to use.",
    )
    group.add(
        "--alpha",
        "-alpha",
        type=float,
        default=1.0,
        # Fixed missing space in the concatenated help text.
        help="Length penalty parameter (higher = longer generation)",
    )
    group.add(
        "--coverage_penalty",
        "-coverage_penalty",
        default="none",
        choices=["none", "wu", "summary"],
        help="Coverage Penalty to use. Only available in beam search.",
    )
    group.add(
        "--beta", "-beta", type=float, default=-0.0, help="Coverage penalty parameter"
    )
    group.add(
        "--stepwise_penalty",
        "-stepwise_penalty",
        action="store_true",
        help="Apply coverage penalty at every decoding step. "
        "Helpful for summary penalty.",
    )

    group = parser.add_argument_group(
        "Decoding tricks",
        ".. Tip:: Following options can be used to limit the decoding length "
        "or content.",
    )
    group.add(
        "--min_length",
        "-min_length",
        type=int,
        default=0,
        help="Minimum prediction length",
    )
    group.add(
        "--max_length",
        "-max_length",
        type=int,
        default=250,
        help="Maximum prediction length.",
    )
    group.add(
        "--block_ngram_repeat",
        "-block_ngram_repeat",
        type=int,
        default=0,
        help="Block repetition of ngrams during decoding.",
    )
    group.add(
        "--ignore_when_blocking",
        "-ignore_when_blocking",
        nargs="+",
        type=str,
        default=[],
        help="Ignore these strings when blocking repeats. "
        "You want to block sentence delimiters.",
    )
    group.add(
        "--replace_unk",
        "-replace_unk",
        action="store_true",
        help="Replace the generated UNK tokens with the "
        "source token that had highest attention weight. If "
        "phrase_table is provided, it will look up the "
        "identified source token and give the corresponding "
        "target token. If it is not provided (or the identified "
        "source token does not exist in the table), then it "
        "will copy the source token.",
    )
    group.add(
        "--ban_unk_token",
        "-ban_unk_token",
        action="store_true",
        help="Prevent unk token generation by setting unk proba to 0",
    )
    group.add(
        "--phrase_table",
        "-phrase_table",
        type=str,
        default="",
        help="If phrase_table is provided (with replace_unk), it will "
        "look up the identified source token and give the "
        "corresponding target token. If it is not provided "
        "(or the identified source token does not exist in "
        "the table), then it will copy the source token.",
    )
|
|
|
|
def translate_opts(parser, dynamic=False):
    """Translation / inference options.

    Args:
        parser: a ``configargparse.ArgumentParser`` (or compatible) to extend.
        dynamic: if True, also register on-the-fly data transform options.
    """
    group = parser.add_argument_group("Model")
    group.add(
        "--model",
        "-model",
        dest="models",
        metavar="MODEL",
        nargs="+",
        type=str,
        default=[],
        required=True,
        help="Path to model .pt file(s). "
        "Multiple models can be specified, "
        "for ensemble decoding.",
    )
    group.add(
        "--precision",
        "-precision",
        default="",
        choices=["", "fp32", "fp16", "int8"],
        # Fixed missing spaces between the concatenated help fragments.
        help="Precision to run inference. "
        "default is model.dtype. "
        "fp32 to force slow FP16 model on GTX1080. "
        "int8 enables pytorch native 8-bit quantization "
        "(cpu only)",
    )
    group.add(
        "--fp32",
        "-fp32",
        action=DeprecateAction,
        help="Deprecated use 'precision' instead",
    )
    group.add(
        "--int8",
        "-int8",
        action=DeprecateAction,
        help="Deprecated use 'precision' instead",
    )
    group.add(
        "--avg_raw_probs",
        "-avg_raw_probs",
        action="store_true",
        help="If this is set, during ensembling scores from "
        "different models will be combined by averaging their "
        "raw probabilities and then taking the log. Otherwise, "
        "the log probabilities will be averaged directly. "
        "Necessary for models whose output layers can assign "
        "zero probability.",
    )

    group = parser.add_argument_group("Data")
    group.add(
        "--data_type",
        "-data_type",
        default="text",
        help="Type of the source input. Options: [text].",
    )
    group.add(
        "--src",
        "-src",
        required=True,
        help="Source sequence to decode (one line per " "sequence)",
    )
    group.add("--tgt", "-tgt", help="True target sequence (optional)")
    group.add(
        "--tgt_file_prefix",
        "-tgt_file_prefix",
        action="store_true",
        help="Generate predictions using provided `-tgt` as prefix.",
    )
    group.add(
        "--output",
        "-output",
        default="pred.txt",
        # Fixed unbalanced parenthesis in the help text.
        help="Path to output the predictions (each line will "
        "be the decoded sequence)",
    )
    group.add(
        "--report_align",
        "-report_align",
        action="store_true",
        help="Report alignment for each translation.",
    )
    group.add(
        "--gold_align",
        "-gold_align",
        action="store_true",
        # Fixed missing space between the concatenated help fragments.
        help="Report alignment between source and gold target. "
        "Useful to test the performance of learnt alignments.",
    )
    group.add(
        "--report_time",
        "-report_time",
        action="store_true",
        help="Report some translation time metrics",
    )

    # Source/target feature options.
    _add_features_opts(parser)

    # Beam search / sampling / penalty options.
    _add_decoding_opts(parser)

    # Inference-side logging options.
    _add_logging_opts(parser, is_train=False)

    distributed_opts(parser)

    group = parser.add_argument_group("Efficiency")
    group.add("--batch_size", "-batch_size", type=int, default=30, help="Batch size")
    group.add(
        "--batch_type",
        "-batch_type",
        default="sents",
        choices=["sents", "tokens"],
        help="Batch grouping for batch_size. Standard "
        "is sents. Tokens will do dynamic batching",
    )
    group.add("--gpu", "-gpu", type=int, default=-1, help="Device to run on")

    if dynamic:
        group.add(
            "-transforms",
            "--transforms",
            default=[],
            nargs="+",
            choices=AVAILABLE_TRANSFORMS.keys(),
            help="Default transform pipeline to apply to data.",
        )

        # Options specific to each registered transform.
        _add_dynamic_transform_opts(parser)

    _add_quant_opts(parser)
|
|
|
|
| |
| |
| |
|
|
|
|
class StoreLoggingLevelAction(configargparse.Action):
    """Argparse action that maps a level name (or number) to a logging level."""

    import logging

    # Symbolic level names and their numeric logging values.
    LEVELS = {
        "CRITICAL": logging.CRITICAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }

    # Valid choices: the symbolic names plus their numeric string forms.
    CHOICES = list(LEVELS) + [str(lvl) for lvl in LEVELS.values()]

    def __init__(self, option_strings, dest, help=None, **kwargs):
        super().__init__(option_strings, dest, help=help, **kwargs)

    def __call__(self, parser, namespace, value, option_string=None):
        # Known names are translated; anything else is stored unchanged
        # (e.g. a raw numeric level string).
        setattr(namespace, self.dest, self.LEVELS.get(value, value))
|
|
|
|
class DeprecateAction(configargparse.Action):
    """Argparse action that rejects a removed flag with an explanatory error."""

    def __init__(self, option_strings, dest, help=None, **kwargs):
        # nargs=0: the flag takes no value; merely using it triggers the error.
        super().__init__(option_strings, dest, nargs=0, help=help, **kwargs)

    def __call__(self, parser, namespace, values, flag_name):
        extra = "" if self.help is None else self.help
        msg = "Flag '%s' is deprecated. %s" % (flag_name, extra)
        raise configargparse.ArgumentTypeError(msg)
|
|