# NeMo/examples/nlp/machine_translation/conf/huggingface.yaml
name: HuggingFaceEncoder
do_training: True # set to False if only preprocessing data
do_testing: False # set to True to run evaluation on test data after training
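# Usage note (assumption: this config is consumed by NeMo's Hydra-based NMT training
# script, examples/nlp/machine_translation/enc_dec_nmt.py). A typical launch overrides
# the data paths on the command line, e.g.:
#   python enc_dec_nmt.py --config-path=conf --config-name=huggingface \
#     model.train_ds.src_file_name=train.en model.train_ds.tgt_file_name=train.de \
#     model.validation_ds.src_file_name=dev.en model.validation_ds.tgt_file_name=dev.de
# The file names above are placeholders, not part of this config.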
model:
  beam_size: 4
  len_pen: 0.6
  max_generation_delta: -1
  label_smoothing: 0.1
  shared_tokenizer: false
  preproc_out_dir: null
  src_language: 'en'
  tgt_language: 'de'
  train_ds:
    src_file_name: null
    tgt_file_name: null
    use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically)
    # config for preprocessing training data and creating a tarred dataset automatically
    tar_file_prefix: parallel # prefix for tar file names
    tar_files: null # if data has already been preprocessed (rest of config ignored)
    metadata_file: null # metadata for tarred dataset
    lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
    num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
    shard_strategy: scatter # tarred dataset shard distribution strategy
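    # Note (assumption based on NeMo's tarred-dataset documentation): valid values are
    # 'scatter' (each worker reads a disjoint subset of shards) and
    # 'replicate' (every worker reads all shards).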
    n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
    tokens_in_batch: 512
    clean: true
    max_seq_length: 512
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
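    # Example (hypothetical paths, sketched from the comments above): to only preprocess raw
    # parallel text into a tarred dataset, run with
    #   do_training=False \
    #   model.preproc_out_dir=/data/preproc \
    #   model.train_ds.use_tarred_dataset=True \
    #   model.train_ds.src_file_name=train.en model.train_ds.tgt_file_name=train.de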
  validation_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
  test_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1
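    # Note (assumption based on common NeMo scheduler semantics): warmup_ratio: 0.1 warms the
    # learning rate up over the first 10% of total training steps before the
    # inverse-square-root decay takes over.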
  encoder_tokenizer:
    library: huggingface
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null
  decoder_tokenizer:
    library: yttm
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null
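  # Note (assumption): library 'yttm' refers to the YouTokenToMe BPE tokenizer;
  # tokenizer_model is expected to point to a trained BPE model file, or one may be
  # created during the preprocessing step.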
  encoder:
    library: huggingface
    model_name: bert-base-uncased
    pretrained: false
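  # Note (assumption based on how NeMo wraps HuggingFace encoders): with pretrained: false,
  # the bert-base-uncased architecture is instantiated with random weights; pretrained: true
  # would load the published HuggingFace checkpoint instead.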
  decoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 2
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    inner_size: 2048
    num_layers: 6
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    pre_ln: false
  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
    use_transformer_init: true
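  # Note (assumption): the head is the final classification layer that projects decoder hidden
  # states to target-vocabulary logits; log_softmax: true makes it emit log-probabilities,
  # which pairs with the label_smoothing setting above.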
trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 50 # Interval of logging.
  check_val_every_n_epoch: 1
  benchmark: False
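# Note (assumption based on standard NeMo example configs): enable_checkpointing and logger
# are disabled here because exp_manager (below) is typically responsible for creating the
# checkpoint callback and loggers.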
exp_manager:
  name: HuggingFaceEncoder
  files_to_copy: []
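  # Commonly enabled exp_manager options, shown commented out (field names assumed from
  # NeMo's exp_manager; not part of the original config):
  # exp_dir: null                    # defaults to ./nemo_experiments when null
  # create_tensorboard_logger: True
  # create_checkpoint_callback: True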