_target_: nemo.collections.llm.gpt.model.base.GPTModel
config:
  _cpu_offloading_context: null
  _target_: nemo.collections.llm.gpt.model.base.GPTConfig
  activation_func:
    _call_: false
    _target_: torch._C._nn.gelu
  activation_func_fp8_input_store: false
  add_bias_linear: true
  add_qkv_bias: false
  apply_query_key_layer_scaling: false
  apply_residual_connection_post_layernorm: false
  apply_rope_fusion: false
  async_tensor_model_parallel_allreduce: false
  attention_dropout: 0.1
  attention_softmax_in_fp32: false
  autocast_dtype: null
  barrier_with_L1_time: true
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: false
  bias_activation_fusion: false
  bias_dropout_fusion: false
  calculate_per_token_loss: false
  clone_scatter_output_in_embedding: true
  config_logger_dir: ''
  context_parallel_size: 1
  cpu_offloading: false
  cpu_offloading_activations: true
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: true
  cross_entropy_loss_fusion: true
  data_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
  deallocate_pipeline_outputs: false
  defer_embedding_wgrad_compute: false
  deterministic_mode: false
  disable_parameter_transpose_cache: false
  distribute_saved_activations: null
  enable_autocast: false
  enable_cuda_graph: false
  expert_model_parallel_size: 1
  external_cuda_graph: false
  ffn_hidden_size: 4096
  finalize_model_grads_func: null
  first_pipeline_num_layers: null
  forward_step_fn:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
  fp16: false
  fp16_lm_cross_entropy: false
  fp32_residual_connection: false
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_interval: 1
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_wgrad: true
  gated_linear_unit: false
  grad_scale_func: null
  grad_sync_func: null
  gradient_accumulation_fusion: true
  hidden_dropout: 0.1
  hidden_size: 1024
  init_method: null
  init_method_std: 0.02
  kv_channels: null
  last_pipeline_num_layers: null
  layernorm_epsilon: 1.0e-05
  layernorm_zero_centered_gamma: false
  make_vocab_size_divisible_by: 128
  masked_softmax_fusion: true
  memory_efficient_layer_norm: false
  moe_aux_loss_coeff: 0
  moe_expert_capacity_factor: null
  moe_extended_tp: false
  moe_grouped_gemm: false
  moe_input_jitter_eps: null
  moe_layer_recompute: false
  moe_pad_expert_input_to_capacity: false
  moe_per_layer_logging: false
  moe_router_load_balancing_type: aux_loss
  moe_router_pre_softmax: false
  moe_router_topk: 2
  moe_shared_expert_intermediate_size: null
  moe_shared_expert_overlap: false
  moe_token_dispatcher_type: allgather
  moe_token_drop_policy: probs
  moe_token_dropping: false
  moe_z_loss_coeff: null
  no_sync_func: null
  normalization: LayerNorm
  num_attention_heads: 8
  num_layers: 2
  num_microbatches_with_partial_activation_checkpoints: null
  num_moe_experts: null
  num_query_groups: null
  output_layer_init_method: null
  overlap_p2p_comm: false
  parallel_output: true
  param_sync_func: null
  params_dtype:
    _call_: false
    _target_: torch.float32
  perform_initialization: true
  persist_layer_norm: false
  pipeline_dtype: null
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: null
  position_embedding_type: learned_absolute
  qk_layernorm: false
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  rotary_base: 10000
  rotary_interleaved: false
  rotary_percent: 1.0
  seq_len_interpolation_factor: null
  seq_length: 1024
  sequence_parallel: false
  share_embeddings_and_output_weights: true
  tensor_model_parallel_size: 1
  test_mode: false
  timers: null
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  tp_only_amax_red: false
  transformer_layer_spec:
    _call_: false
    _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
  use_cpu_initialization: false
  use_ring_exchange_p2p: false
  use_te_rng_tracker: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  wgrad_deferral_limit: 0
  window_size: null
model_transform: null
optim:
  _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
  config:
    _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1.0e-08
    barrier_with_L1_time: false
    bf16: false
    clip_grad: 1.0
    config_logger_dir: ''
    decoupled_lr: null
    decoupled_min_lr: null
    fp16: false
    hysteresis: 2
    initial_loss_scale: 4294967296
    log_num_zeros_in_grad: false
    loss_scale: null
    loss_scale_window: 1000
    lr: 0.0001
    min_loss_scale: 1.0
    min_lr: null
    optimizer: adam
    overlap_param_gather_with_optimizer_step: false
    params_dtype:
      _call_: false
      _target_: torch.float32
    sgd_momentum: 0.9
    timers: null
    use_distributed_optimizer: true
    weight_decay: 0.01
  lr_mult: 1.0
  lr_scheduler: null
  no_weight_decay_cond: null
  scale_lr_cond: null
tokenizer:
  _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
  bos_token: null
  cls_token: null
  eos_token: null
  mask_token: null
  merges_file: megatron-gpt-345m_merges
  pad_token: null
  pretrained_model_name: gpt2
  sep_token: null
  trust_remote_code: false
  unk_token: null
  use_fast: false
  vocab_file: megatron-gpt-345m_vocab