# Upload folder using huggingface_hub
#
# 9cfc143 verified about 1 year ago
#
# 8.14 kB

	_target_: nemo.collections.llm.gpt.model.ssm.MambaModel
	config:
	_cpu_offloading_context: null
	_target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfig8B
	account_for_embedding_in_pipeline_split: false
	account_for_loss_in_pipeline_split: false
	activation_func:
	_call_: false
	_target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfigBase.<lambda>
	activation_func_fp8_input_store: false
	add_bias_linear: false
	add_qkv_bias: false
	apply_query_key_layer_scaling: false
	apply_residual_connection_post_layernorm: false
	apply_rope_fusion: true
	async_tensor_model_parallel_allreduce: false
	attention_backend:
	_call_: true
	_target_: megatron.core.transformer.enums.AttnBackend
	attention_dropout: 0.0
	attention_softmax_in_fp32: false
	autocast_dtype: null
	barrier_with_L1_time: true
	batch_p2p_comm: true
	batch_p2p_sync: true
	bf16: true
	bias_activation_fusion: false
	bias_dropout_fusion: true
	calculate_per_token_loss: false
	clone_scatter_output_in_embedding: true
	config_logger_dir: ''
	context_parallel_size: 1
	cp_comm_type: null
	cpu_offloading: false
	cpu_offloading_activations: true
	cpu_offloading_num_layers: 0
	cpu_offloading_weights: true
	cross_entropy_fusion_impl: native
	cross_entropy_loss_fusion: true
	cuda_graph_retain_backward_graph: false
	cuda_graph_scope: full
	cuda_graph_use_single_mempool: false
	cuda_graph_warmup_steps: 3
	data_step_fn:
	_call_: false
	_target_: nemo.collections.llm.gpt.model.base.gpt_data_step
	deallocate_pipeline_outputs: true
	defer_embedding_wgrad_compute: false
	deterministic_mode: false
	disable_parameter_transpose_cache: false
	distribute_saved_activations: null
	enable_autocast: false
	enable_cuda_graph: false
	expert_model_parallel_size: 1
	expert_tensor_parallel_size: null
	external_cuda_graph: false
	ffn_hidden_size: 21504
	finalize_model_grads_func: null
	first_last_layers_bf16: true
	flash_decode: false
	forward_step_fn:
	_call_: false
	_target_: nemo.collections.llm.gpt.model.ssm.ssm_forward_step
	fp16: false
	fp16_lm_cross_entropy: false
	fp32_residual_connection: false
	fp8: null
	fp8_amax_compute_algo: most_recent
	fp8_amax_history_len: 1
	fp8_dot_product_attention: false
	fp8_interval: 1
	fp8_margin: 0
	fp8_multi_head_attention: false
	fp8_recipe: delayed
	fp8_wgrad: true
	gated_linear_unit: false
	get_attention_mask_from_fusion: false
	grad_scale_func: null
	grad_sync_func: null
	gradient_accumulation_fusion: false
	hidden_dropout: 0.0
	hidden_size: 4096
	hierarchical_context_parallel_sizes: null
	hybrid_attention_ratio: 0.0
	hybrid_mlp_ratio: 0.0
	hybrid_override_pattern: M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-M-
	inference_rng_tracker: false
	init_method: null
	init_method_std: 0.02
	init_model_with_meta_device: false
	is_hybrid_model: true
	kv_channels: null
	layernorm_epsilon: 1.0e-05
	layernorm_zero_centered_gamma: false
	make_vocab_size_divisible_by: 128
	mamba_head_dim: 64
	mamba_nheads: 128
	mamba_num_groups: 8
	mamba_state_dim: 128
	mapping_type: nvidia-hybrid-nemotronh
	masked_softmax_fusion: true
	memory_efficient_layer_norm: false
	microbatch_group_size_per_vp_stage: 1
	moe_aux_loss_coeff: 0
	moe_enable_deepep: false
	moe_expert_capacity_factor: null
	moe_extended_tp: false
	moe_ffn_hidden_size: null
	moe_grouped_gemm: false
	moe_input_jitter_eps: null
	moe_layer_freq: 1
	moe_layer_recompute: false
	moe_pad_expert_input_to_capacity: false
	moe_per_layer_logging: false
	moe_permute_fusion: false
	moe_router_bias_update_rate: 0.001
	moe_router_dtype: null
	moe_router_enable_expert_bias: false
	moe_router_group_topk: null
	moe_router_load_balancing_type: aux_loss
	moe_router_num_groups: null
	moe_router_pre_softmax: false
	moe_router_score_function: softmax
	moe_router_topk: 2
	moe_router_topk_limited_devices: null
	moe_router_topk_scaling_factor: null
	moe_shared_expert_intermediate_size: null
	moe_shared_expert_overlap: false
	moe_token_dispatcher_type: allgather
	moe_token_drop_policy: probs
	moe_token_dropping: false
	moe_use_legacy_grouped_gemm: false
	moe_z_loss_coeff: null
	mtp_loss_scaling_factor: null
	mtp_num_layers: null
	multi_latent_attention: false
	no_sync_func: null
	normalization: RMSNorm
	num_attention_heads: 32
	num_layers: 52
	num_layers_at_end_in_bf16: 1
	num_layers_at_start_in_bf16: 1
	num_layers_in_first_pipeline_stage: null
	num_layers_in_last_pipeline_stage: null
	num_microbatches_with_partial_activation_checkpoints: null
	num_moe_experts: null
	num_query_groups: 8
	output_layer_init_method: null
	overlap_p2p_comm: false
	overlap_p2p_comm_warmup_flush: false
	parallel_output: true
	param_sync_func: null
	params_dtype:
	_call_: false
	_target_: torch.bfloat16
	perform_initialization: true
	persist_layer_norm: true
	pipeline_dtype: null
	pipeline_model_parallel_comm_backend: null
	pipeline_model_parallel_size: 1
	pipeline_model_parallel_split_rank: null
	position_embedding_type: none
	post_process: true
	pre_process: true
	qk_layernorm: false
	recompute_granularity: null
	recompute_method: null
	recompute_num_layers: null
	rotary_base: 10000
	rotary_interleaved: false
	rotary_percent: 1.0
	seq_len_interpolation_factor: null
	seq_length: 8192
	sequence_parallel: false
	share_embeddings_and_output_weights: false
	softmax_scale: null
	tensor_model_parallel_size: 1
	test_mode: false
	timers: null
	tokenizer_library: tiktoken
	tokenizer_model_path: null
	tokenizer_name: TiktokenTokenizer
	tp_comm_atomic_ag: false
	tp_comm_atomic_rs: false
	tp_comm_bootstrap_backend: nccl
	tp_comm_bulk_dgrad: true
	tp_comm_bulk_wgrad: true
	tp_comm_overlap: false
	tp_comm_overlap_ag: true
	tp_comm_overlap_disable_fc1: false
	tp_comm_overlap_disable_qkv: false
	tp_comm_overlap_rs: true
	tp_comm_overlap_rs_dgrad: false
	tp_comm_split_ag: true
	tp_comm_split_rs: true
	tp_only_amax_red: false
	use_cpu_initialization: false
	use_custom_fsdp: false
	use_ring_exchange_p2p: false
	use_te_rng_tracker: false
	variable_seq_lengths: false
	virtual_pipeline_model_parallel_size: null
	vocab_file: null
	vocab_size: 131072
	wgrad_deferral_limit: 0
	window_size: null
	model_transform: null
	optim:
	_target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
	config:
	_target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
	adam_beta1: 0.9
	adam_beta2: 0.999
	adam_eps: 1.0e-08
	barrier_with_L1_time: false
	bf16: false
	clip_grad: 1.0
	config_logger_dir: ''
	decoupled_lr: null
	decoupled_min_lr: null
	exp_avg_dtype:
	_call_: false
	_target_: torch.float32
	exp_avg_sq_dtype:
	_call_: false
	_target_: torch.float32
	fp16: false
	hysteresis: 2
	initial_loss_scale: 4294967296
	log_num_zeros_in_grad: false
	loss_scale: null
	loss_scale_window: 1000
	lr: 0.0001
	main_grads_dtype:
	_call_: false
	_target_: torch.float32
	main_params_dtype:
	_call_: false
	_target_: torch.float32
	min_loss_scale: 1.0
	min_lr: null
	optimizer: adam
	optimizer_cpu_offload: false
	optimizer_offload_fraction: 0.0
	overlap_cpu_optimizer_d2h_h2d: false
	overlap_param_gather_with_optimizer_step: false
	params_dtype:
	_call_: false
	_target_: torch.float32
	pin_cpu_grads: true
	pin_cpu_params: true
	sgd_momentum: 0.9
	timers: null
	use_distributed_optimizer: true
	use_precision_aware_optimizer: false
	use_torch_optimizer_for_cpu_offload: false
	weight_decay: 0.01
	lr_mult: 1.0
	lr_scheduler: null
	no_weight_decay_cond: null
	scale_lr_cond: null
	tokenizer:
	_target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
	additional_special_tokens: []
	bos_token: null
	cls_token: null
	eos_token: null
	include_special_tokens: false
	mask_token: null
	merges_file: null
	pad_token: null
	pretrained_model_name: nemo_tokenizer
	sep_token: null
	trust_remote_code: true
	unk_token: null
	use_fast: false
	vocab_file: null