{
  "_description": "Configuration for Sparse (Top-K) Mixture-of-Experts Model",
  "_model_type": "MoeLM",
  "model": {
    "vocab_size": 10000,
    "num_layers": 4,
    "context_length": 256,
    "d_model": 512,
    "d_ff": 2048,
    "num_heads": 8,
    "theta": 10000.0
  },
  "moe": {
    "num_experts": 8,
    "top_k": 2,
    "aux_loss_weight": 0.01
  },
  "optimizer": {
    "learning_rate": 3e-4,
    "alpha_min": 3e-5,
    "beta1": 0.9,
    "beta2": 0.95,
    "eps": 1e-8,
    "weight_decay": 0.1,
    "max_grad_norm": 1.0
  },
  "scheduler": {
    "warmup_steps": 2000,
    "max_steps": 20000
  },
  "training": {
    "batch_size": 8,
    "eval_interval": 500,
    "log_interval": 100,
    "save_interval": 5000,
    "eval_steps": 100
  },
  "paths": {
    "train_data_path": "data/train.bin",
    "val_data_path": "data/val.bin",
    "checkpoint_dir": "checkpoints_moe",
    "resume_from": null
  },
  "system": {
    "device": "cuda"
  },
  "logging": {
    "use_wandb": false,
    "wandb_project": "transformer-moe-lm",
    "wandb_run_name": null
  }
}