Mamba_500M / config.json
yagizdevre's picture
model is added
be761d6
{
"model_type": "minimamba",
"_name_or_path": "Mamba_500M",
"architectures": ["MiniMamba"],
"dim": 1024,
"num_layers": 54,
"num_heads": 32,
"state_dim": 128,
"num_groups": 1,
"conv_size": 4,
"use_mem_eff_path": true,
"dt_bias": true,
"D_has_head_dim": true,
"learnable_init_states": false,
"ssm_chunk_size": 256,
"vocab_size": 200064,
"ffn_dim_multiplier": 2.0,
"multiple_of": 256,
"norm_eps": 1e-5,
"init_use_depth": false,
"init_base_std": null,
"init_std_factor": "disabled",
"hidden_act": "silu",
"bias": false,
"torch_dtype": "bfloat16",
"seed": 1337,
"init_args": {
"dt_max": 0.1,
"dt_min": 0.001,
"dt_init_floor": 1e-4,
"A_init_min": 0.01,
"A_init_max": 16
},
"seq_len": 8192,
"weight_tying": false,
"dropout": 0.0,
"num_epochs": 1,
"global_bsz": 524288,
"bsz": 1,
"warmup_steps": 1907,
"eval_period": 50,
"save_period": 500,
"max_lr": 3.0e-4,
"min_lr": 3.0e-5,
"max_norm": 1.0,
"dilation": 1,
"fsdp": true,
"ddp": false,
"mixed_precision": true,
"cpu_offload": false,
"sharding_strategy": "full_shard",
"state_dict_type": "full",
"auto_wrap_policy": "partial",
"backward_prefetch": "backward_pre",
"forward_prefetch": false,
"sync_module_states": true,
"use_orig_params": true,
"device_id": null,
"precision": {
"param": "bfloat16",
"reduce": "bfloat16",
"buffer": "bfloat16"
},
"fsdp_modules": [
"MambaBlock"
],
"use_activation_checkpointing": true,
"use_attn": false,
"softcap": 50.0,
"torch_compile": false
}