{
"_description": "Configuration for Sparse (Top-K) Mixture-of-Experts Model",
"_model_type": "MoeLM",
"model": {
"vocab_size": 10000,
"num_layers": 4,
"context_length": 256,
"d_model": 512,
"d_ff": 2048,
"num_heads": 8,
"theta": 10000.0
},
"moe": {
"num_experts": 8,
"top_k": 2,
"aux_loss_weight": 0.01
},
"optimizer": {
"learning_rate": 3e-4,
"alpha_min": 3e-5,
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-8,
"weight_decay": 0.1,
"max_grad_norm": 1.0
},
"scheduler": {
"warmup_steps": 2000,
"max_steps": 20000
},
"training": {
"batch_size": 8,
"eval_interval": 500,
"log_interval": 100,
"save_interval": 5000,
"eval_steps": 100
},
"paths": {
"train_data_path": "data/train.bin",
"val_data_path": "data/val.bin",
"checkpoint_dir": "checkpoints_moe",
"resume_from": null
},
"system": {
"device": "cuda"
},
"logging": {
"use_wandb": false,
"wandb_project": "transformer-moe-lm",
"wandb_run_name": null
}
}