chatsd
/

Sparse_Dynamic_MOE

Text Generation

mixture-of-experts

Mixture of Experts

conditional-computation

Model card | Files and versions

chatsd committed on Dec 25, 2025

Commit

41dcf48

·

verified ·

1 Parent(s): f486215

Config Files

Files changed (2) hide show

dynamic_moe_config.json +55 -0
sparse_moe_config.json +52 -0

dynamic_moe_config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+    "_description": "Configuration for Dynamic-K Mixture-of-Experts Model",
+    "_model_type": "DynamicMOELM",
+    "model": {
+        "vocab_size": 10000,
+        "num_layers": 4,
+        "context_length": 256,
+        "d_model": 512,
+        "d_ff": 2048,
+        "num_heads": 8,
+        "theta": 10000.0
+    },
+    "moe": {
+        "num_experts": 4,
+        "confidence_threshold": 0.8
+    },
+    "loss_weights": {
+        "balance_loss_weight": 0.01,
+        "entropy_loss_weight": 0.001
+    },
+    "optimizer": {
+        "learning_rate": 3e-4,
+        "beta1": 0.9,
+        "beta2": 0.95,
+        "eps": 1e-8,
+        "weight_decay": 0.1,
+        "max_grad_norm": 1.0
+    },
+    "scheduler": {
+        "warmup_steps": 2000,
+        "max_steps": 20000
+    },
+    "training": {
+        "batch_size": 4,
+        "grad_accum_steps": 1,
+        "eval_interval": 500,
+        "log_interval": 100,
+        "save_interval": 2000,
+        "eval_steps": 10
+    },
+    "paths": {
+        "train_data_path": "data/train.txt",
+        "val_data_path": "data/test.txt",
+        "checkpoint_dir": "checkpoints_dynamic_moe",
+        "resume_from": null
+    },
+    "system": {
+        "device": "cuda"
+    },
+    "logging": {
+        "use_wandb": true,
+        "wandb_project": "dynamic-moe-phase2",
+        "wandb_run_name": null
+    }
+}

sparse_moe_config.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+    "_description": "Configuration for Sparse (Top-K) Mixture-of-Experts Model",
+    "_model_type": "MoeLM",
+    "model": {
+        "vocab_size": 10000,
+        "num_layers": 4,
+        "context_length": 256,
+        "d_model": 512,
+        "d_ff": 2048,
+        "num_heads": 8,
+        "theta": 10000.0
+    },
+    "moe": {
+        "num_experts": 8,
+        "top_k": 2,
+        "aux_loss_weight": 0.01
+    },
+    "optimizer": {
+        "learning_rate": 3e-4,
+        "alpha_min": 3e-5,
+        "beta1": 0.9,
+        "beta2": 0.95,
+        "eps": 1e-8,
+        "weight_decay": 0.1,
+        "max_grad_norm": 1.0
+    },
+    "scheduler": {
+        "warmup_steps": 2000,
+        "max_steps": 20000
+    },
+    "training": {
+        "batch_size": 8,
+        "eval_interval": 500,
+        "log_interval": 100,
+        "save_interval": 5000,
+        "eval_steps": 100
+    },
+    "paths": {
+        "train_data_path": "data/train.bin",
+        "val_data_path": "data/val.bin",
+        "checkpoint_dir": "checkpoints_moe",
+        "resume_from": null
+    },
+    "system": {
+        "device": "cuda"
+    },
+    "logging": {
+        "use_wandb": false,
+        "wandb_project": "transformer-moe-lm",
+        "wandb_run_name": null
+    }
+}