{
  "model_type": "minimamba",
  "_name_or_path": "Mamba_500M",
  "architectures": ["MiniMamba"],
  "dim": 1024,
  "num_layers": 54,
  "num_heads": 32,
  "state_dim": 128,
  "num_groups": 1,
  "conv_size": 4,
  "use_mem_eff_path": true,
  "dt_bias": true,
  "D_has_head_dim": true,
  "learnable_init_states": false,
  "ssm_chunk_size": 256,
  "vocab_size": 200064,
  "ffn_dim_multiplier": 2.0,
  "multiple_of": 256,
  "norm_eps": 1e-5,
  "init_use_depth": false,
  "init_base_std": null,
  "init_std_factor": "disabled",
  "hidden_act": "silu",
  "bias": false,
  "torch_dtype": "bfloat16",
  "seed": 1337,
  "init_args": {
    "dt_max": 0.1,
    "dt_min": 0.001,
    "dt_init_floor": 1e-4,
    "A_init_min": 0.01,
    "A_init_max": 16
  },
  "seq_len": 8192,
  "weight_tying": false,
  "dropout": 0.0,
  "num_epochs": 1,
  "global_bsz": 524288,
  "bsz": 1,
  "warmup_steps": 1907,
  "eval_period": 50,
  "save_period": 500,
  "max_lr": 3.0e-4,
  "min_lr": 3.0e-5,
  "max_norm": 1.0,
  "dilation": 1,
  "fsdp": true,
  "ddp": false,
  "mixed_precision": true,
  "cpu_offload": false,
  "sharding_strategy": "full_shard",
  "state_dict_type": "full",
  "auto_wrap_policy": "partial",
  "backward_prefetch": "backward_pre",
  "forward_prefetch": false,
  "sync_module_states": true,
  "use_orig_params": true,
  "device_id": null,
  "precision": {
    "param": "bfloat16",
    "reduce": "bfloat16",
    "buffer": "bfloat16"
  },
  "fsdp_modules": [
    "MambaBlock"
  ],
  "use_activation_checkpointing": true,
  "use_attn": false,
  "softcap": 50.0,
  "torch_compile": false
}