{
"_description": "Configuration for Sparse (Top-K) Mixture-of-Experts Model",
"_model_type": "MoeLM",
"model": {
"vocab_size": 10000,
"num_layers": 4,
"context_length": 256,
"d_model": 512,
"d_ff": 2048,
"num_heads": 8,
"theta": 10000.0
},
"moe": {
"num_experts": 8,
"top_k": 2,
"aux_loss_weight": 0.01
},
"optimizer": {
"learning_rate": 3e-4,
"alpha_min": 3e-5,
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-8,
"weight_decay": 0.1,
"max_grad_norm": 1.0
},
"scheduler": {
"warmup_steps": 2000,
"max_steps": 20000
},
"training": {
"batch_size": 8,
"eval_interval": 500,
"log_interval": 100,
"save_interval": 5000,
"eval_steps": 100
},
"paths": {
"train_data_path": "data/train.bin",
"val_data_path": "data/val.bin",
"checkpoint_dir": "checkpoints_moe",
"resume_from": null
},
"system": {
"device": "cuda"
},
"logging": {
"use_wandb": false,
"wandb_project": "transformer-moe-lm",
"wandb_run_name": null
}
}