{
  "_name_or_path": "./MoLM-700M-4B",
  "activation_function": "gelu_new",
  "architectures": [
    "ModuleFormerForCausalLM"
  ],
  "att_func": "stickbreaking",
  "att_hidden": 1024,
  "attn_pdrop": 0,
  "aux_loss_type": "mi",
  "aux_loss_weight": 0,
  "block_size": 512,
  "bos_token_id": 50256,
  "embd_pdrop": 0,
  "eos_token_id": 50256,
  "ffd_hidden": 2048,
  "gate_type": "mlp",
  "gating_size": 256,
  "history_length": 512,
  "initializer_range": 0.02,
  "k_att": 4,
  "k_mlp": 4,
  "layer_norm_epsilon": 1e-05,
  "local_size": 1,
  "model_type": "moduleformer",
  "moe_pdrop": 0,
  "moe_type": "moe",
  "n_att_experts": 16,
  "n_ctx": 12288,
  "n_embd": 1024,
  "n_head": 1,
  "n_layer": 24,
  "n_mlp_experts": 32,
  "pre_norm": true,
  "resid_pdrop": 0,
  "sample_topk": 0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.28.1",
  "universal": false,
  "use_cache": true,
  "vocab_size": 50295,
  "world_size": null
}
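
A minimal sketch of loading a ModuleFormer checkpoint that uses this config. It assumes the checkpoint and its custom modeling code live locally at `./MoLM-700M-4B` (the `_name_or_path` above) or under an equivalent Hugging Face repo id, so `trust_remote_code=True` is needed because `ModuleFormerForCausalLM` is not a built-in `transformers` architecture; the path and prompt are illustrative only.

```python
# Sketch: load the ModuleFormer config and model, then run a short generation.
# Assumption: the repo ships the custom ModuleFormer code, hence trust_remote_code=True.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_path = "./MoLM-700M-4B"  # hypothetical local path, taken from _name_or_path

# Inspect a few of the fields shown in the config above.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.model_type, config.n_layer, config.n_att_experts, config.n_mlp_experts)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16"
    trust_remote_code=True,
)

inputs = tokenizer("ModuleFormer is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```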
|
|