{
  "_description": "Configuration for Sparse (Top-K) Mixture-of-Experts Model",
  "_model_type": "MoeLM",
  "model": {
    "vocab_size": 10000,
    "num_layers": 4,
    "context_length": 256,
    "d_model": 512,
    "d_ff": 2048,
    "num_heads": 8,
    "theta": 10000.0
  },
  "moe": {
    "num_experts": 8,
    "top_k": 2,
    "aux_loss_weight": 0.01
  },
  "optimizer": {
    "learning_rate": 3e-4,
    "alpha_min": 3e-5,
    "beta1": 0.9,
    "beta2": 0.95,
    "eps": 1e-8,
    "weight_decay": 0.1,
    "max_grad_norm": 1.0
  },
  "scheduler": {
    "warmup_steps": 2000,
    "max_steps": 20000
  },
  "training": {
    "batch_size": 8,
    "eval_interval": 500,
    "log_interval": 100,
    "save_interval": 5000,
    "eval_steps": 100
  },
  "paths": {
    "train_data_path": "data/train.bin",
    "val_data_path": "data/val.bin",
    "checkpoint_dir": "checkpoints_moe",
    "resume_from": null
  },
  "system": {
    "device": "cuda"
  },
  "logging": {
    "use_wandb": false,
    "wandb_project": "transformer-moe-lm",
    "wandb_run_name": null
  }
}