chatsd committed on
Commit
41dcf48
·
verified ·
1 Parent(s): f486215

Config Files

Browse files
Files changed (2) hide show
  1. dynamic_moe_config.json +55 -0
  2. sparse_moe_config.json +52 -0
dynamic_moe_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_description": "Configuration for Dynamic-K Mixture-of-Experts Model",
3
+ "_model_type": "DynamicMOELM",
4
+ "model": {
5
+ "vocab_size": 10000,
6
+ "num_layers": 4,
7
+ "context_length": 256,
8
+ "d_model": 512,
9
+ "d_ff": 2048,
10
+ "num_heads": 8,
11
+ "theta": 10000.0
12
+ },
13
+ "moe": {
14
+ "num_experts": 4,
15
+ "confidence_threshold": 0.8
16
+ },
17
+ "loss_weights": {
18
+ "balance_loss_weight": 0.01,
19
+ "entropy_loss_weight": 0.001
20
+ },
21
+ "optimizer": {
22
+ "learning_rate": 3e-4,
23
+ "beta1": 0.9,
24
+ "beta2": 0.95,
25
+ "eps": 1e-8,
26
+ "weight_decay": 0.1,
27
+ "max_grad_norm": 1.0
28
+ },
29
+ "scheduler": {
30
+ "warmup_steps": 2000,
31
+ "max_steps": 20000
32
+ },
33
+ "training": {
34
+ "batch_size": 4,
35
+ "grad_accum_steps": 1,
36
+ "eval_interval": 500,
37
+ "log_interval": 100,
38
+ "save_interval": 2000,
39
+ "eval_steps": 10
40
+ },
41
+ "paths": {
42
+ "train_data_path": "data/train.txt",
43
+ "val_data_path": "data/test.txt",
44
+ "checkpoint_dir": "checkpoints_dynamic_moe",
45
+ "resume_from": null
46
+ },
47
+ "system": {
48
+ "device": "cuda"
49
+ },
50
+ "logging": {
51
+ "use_wandb": true,
52
+ "wandb_project": "dynamic-moe-phase2",
53
+ "wandb_run_name": null
54
+ }
55
+ }
sparse_moe_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_description": "Configuration for Sparse (Top-K) Mixture-of-Experts Model",
3
+ "_model_type": "MoeLM",
4
+ "model": {
5
+ "vocab_size": 10000,
6
+ "num_layers": 4,
7
+ "context_length": 256,
8
+ "d_model": 512,
9
+ "d_ff": 2048,
10
+ "num_heads": 8,
11
+ "theta": 10000.0
12
+ },
13
+ "moe": {
14
+ "num_experts": 8,
15
+ "top_k": 2,
16
+ "aux_loss_weight": 0.01
17
+ },
18
+ "optimizer": {
19
+ "learning_rate": 3e-4,
20
+ "alpha_min": 3e-5,
21
+ "beta1": 0.9,
22
+ "beta2": 0.95,
23
+ "eps": 1e-8,
24
+ "weight_decay": 0.1,
25
+ "max_grad_norm": 1.0
26
+ },
27
+ "scheduler": {
28
+ "warmup_steps": 2000,
29
+ "max_steps": 20000
30
+ },
31
+ "training": {
32
+ "batch_size": 8,
33
+ "eval_interval": 500,
34
+ "log_interval": 100,
35
+ "save_interval": 5000,
36
+ "eval_steps": 100
37
+ },
38
+ "paths": {
39
+ "train_data_path": "data/train.bin",
40
+ "val_data_path": "data/val.bin",
41
+ "checkpoint_dir": "checkpoints_moe",
42
+ "resume_from": null
43
+ },
44
+ "system": {
45
+ "device": "cuda"
46
+ },
47
+ "logging": {
48
+ "use_wandb": false,
49
+ "wandb_project": "transformer-moe-lm",
50
+ "wandb_run_name": null
51
+ }
52
+ }