{
  "model_type": "minimamba",
  "_name_or_path": "Mamba_500M",
  "architectures": ["MiniMamba"],
  "dim": 1024,
  "num_layers": 54,
  "num_heads": 32,
  "state_dim": 128,
  "num_groups": 1,
  "conv_size": 4,
  "use_mem_eff_path": true,
  "dt_bias": true,
  "D_has_head_dim": true,
  "learnable_init_states": false,
  "ssm_chunk_size": 256,
  "vocab_size": 200064,
  "ffn_dim_multiplier": 2.0,
  "multiple_of": 256,
  "norm_eps": 1e-5,
  "init_use_depth": false,
  "init_base_std": null,
  "init_std_factor": "disabled",
  "hidden_act": "silu",
  "bias": false,
  "torch_dtype": "bfloat16",
  "seed": 1337,
  "init_args": {
    "dt_max": 0.1,
    "dt_min": 0.001,
    "dt_init_floor": 1e-4,
    "A_init_min": 0.01,
    "A_init_max": 16
  },
  "seq_len": 8192,
  "weight_tying": false,
  "dropout": 0.0,
  "num_epochs": 1,
  "global_bsz": 524288,
  "bsz": 1,
  "warmup_steps": 1907,
  "eval_period": 50,
  "save_period": 500,
  "max_lr": 3.0e-4,
  "min_lr": 3.0e-5,
  "max_norm": 1.0,
  "dilation": 1,
  "fsdp": true,
  "ddp": false,
  "mixed_precision": true,
  "cpu_offload": false,
  "sharding_strategy": "full_shard",
  "state_dict_type": "full",
  "auto_wrap_policy": "partial",
  "backward_prefetch": "backward_pre",
  "forward_prefetch": false,
  "sync_module_states": true,
  "use_orig_params": true,
  "device_id": null,
  "precision": {
    "param": "bfloat16",
    "reduce": "bfloat16",
    "buffer": "bfloat16"
  },
  "fsdp_modules": [
    "MambaBlock"
  ],
  "use_activation_checkpointing": true,
  "use_attn": false,
  "softcap": 50.0,
  "torch_compile": false
}