{
"alpha_init": 1.5,
"architectures": [
"MoAMetricLM"
],
"attn_drop": 0.1,
"attn_heads": 16,
"bos_token_id": 0,
"conv_kernel": 5,
"conv_mult": 2,
"dim": 512,
"discrepancy_modulation": true,
"drop_path": 0.0,
"dtype": "float32",
"enable_feature_gates": true,
"enable_router_gates": true,
"energy_amplification": 3.1415,
"eos_token_id": 0,
"ff_mult": 3,
"ffn_hidden": 1536,
"hidden_size": 512,
"intermediate_size": 1536,
"layer_scale_init_value": 0.0001,
"learn_alpha": true,
"learn_radius": true,
"lm_attn_heads": 16,
"lm_ffn_hidden": 1536,
"lm_intermediate_size": 1536,
"lm_mixer_hidden": 768,
"lm_mqa_q_heads": 16,
"lm_num_attention_heads": 16,
"lm_num_key_value_heads": 16,
"lm_proj_drop": 0.1,
"lm_router_dropout": 0.1,
"lm_router_hidden": 128,
"lm_router_temperature": 1.5,
"lr_rank": 32,
"maha_init": 1.0,
"max_position_embeddings": 2048,
"max_seq_len_cached": 2048,
"metric": "maha_diag",
"mixer_hidden": 768,
"model_type": "moa_metric",
"mqa_q_heads": 16,
"n_branches": 3,
"num_attention_heads": 16,
"num_hidden_layers": 4,
"num_key_value_heads": 16,
"num_layers": 4,
"origin_init_scale": 0.0,
"pad_token_id": 0,
"proj_drop": 0.1,
"r_basis": 16,
"radius_init": 3.5,
"router_dropout": 0.1,
"router_hidden": 128,
"router_temperature": 2.0,
"router_topk": 2,
"theta_base": 10000.0,
"ti_reg_samples": 16,
"ti_reg_weight": 0.01,
"tie_word_embeddings": true,
"transformers_version": "5.0.0",
"use_balls": true,
"vocab_size": 50277
}
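
Since "moa_metric" is not a built-in transformers model type, loading this config presumably requires the custom modeling code shipped alongside it in the repository. A minimal sketch, assuming that code is present and that enabling trust_remote_code is acceptable; the repository path below is hypothetical:

from transformers import AutoConfig

# Hypothetical path: the directory (or hub repo id) containing this config.json.
config = AutoConfig.from_pretrained(
    "path/to/moa-metric-model",
    trust_remote_code=True,  # assumed necessary: "moa_metric" is defined by repo code
)

print(config.model_type)         # moa_metric
print(config.num_hidden_layers)  # 4
print(config.hidden_size)        # 512
print(config.router_topk)        # 2

If the custom code is unavailable, the file is still plain JSON and can be inspected with the standard json module without any model-specific dependencies.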