File size: 1,718 Bytes
9991887
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
{
    "model_type": "minimamba",
    "_name_or_path": "Mamba_500M",
    "architectures": ["MiniMamba"],
    "dim": 1024,
    "num_layers": 54,
    "num_heads": 32,
    "state_dim": 128,
    "num_groups": 1,
    "conv_size": 4,
    "use_mem_eff_path": true,
    "dt_bias": true,
    "D_has_head_dim": true,
    "learnable_init_states": false,
    "ssm_chunk_size": 256,
    "vocab_size": 200064,
    "ffn_dim_multiplier": 2.0,
    "multiple_of": 256,
    "norm_eps": 1e-05,
    "init_use_depth": false,
    "init_base_std": null,
    "init_std_factor": "disabled",
    "hidden_act": "silu",
    "bias": false,
    "torch_dtype": "bfloat16",
    "seed": 1337,
    "init_args": {
      "dt_max": 0.1,
      "dt_min": 0.001,
      "dt_init_floor": 0.0001,
      "A_init_min": 0.01,
      "A_init_max": 16
    },
    "seq_len": 8192,
    "weight_tying": true,
    "dropout": 0.0,
    "num_epochs": 1,
    "global_bsz": 524288,
    "bsz": 1,
    "warmup_steps": 1907,
    "eval_period": 50,
    "save_period": 500,
    "max_lr": 0.0003,
    "min_lr": 3e-05,
    "max_norm": 1.0,
    "dilation": 1,
    "fsdp": true,
    "ddp": false,
    "mixed_precision": true,
    "cpu_offload": false,
    "sharding_strategy": "full_shard",
    "state_dict_type": "full",
    "auto_wrap_policy": "partial",
    "backward_prefetch": "backward_pre",
    "forward_prefetch": false,
    "sync_module_states": true,
    "use_orig_params": true,
    "device_id": null,
    "precision": {
      "param": "bfloat16",
      "reduce": "bfloat16",
      "buffer": "bfloat16"
    },
    "fsdp_modules": [
      "MambaBlock"
    ],
    "use_activation_checkpointing": true,
    "use_attn": false,
    "softcap": 50.0,
    "torch_compile": true
}