{
    "model": {
        "max_batch_size": 16,
        "max_seq_len": 512,
        "dtype": "fp32",
        "scale_fmt": null,
        "vocab_size": 32768,
        "dim": 512,
        "inter_dim": 4096,
        "moe_inter_dim": 512,
        "n_layers": 16,
        "n_dense_layers": 3,
        "n_heads": 12,
        "n_routed_experts": 4,
        "n_shared_experts": 1,
        "n_activated_experts": 2,
        "route_scale": 1.0,
        "use_routing_bias": true,
        "q_lora_rank": 0,
        "kv_lora_rank": 256,
        "qk_nope_head_dim": 64,  
        "qk_rope_head_dim": 32,
        "v_head_dim": 64,
        "original_seq_len": 4096,
        "rope_theta": 10000.0,
        "rope_factor": 40,
        "beta_fast": 32,
        "beta_slow": 1,
        "mscale": 1.0,
        "tokenizer_name": "turkish"
    },
    "training": {
        "learning_rate": 3e-5,
        "weight_decay": 0.1,
        "beta1": 0.9,
        "beta2": 0.95,
        "grad_clip": 1.0,
        "warmup_steps": 1000,
        "total_steps": 100000,
        "use_checkpointing": false,
        "expert_rotation_steps": 5000,
        "gradient_accumulation_steps": 8,
        "eval_every": 1000,
        "save_every": 5000,
        "save_dir": "./checkpoints",
        "log_every": 100,
        "dtype": "fp32",
        "compile": false,
        "max_val_batches": 50,
        "val_batch_size_multiplier": 4,
        "train_all_experts":false
    },
    "data": {
        "train_file": "./data/train.txt",
        "val_file": "./data/val.txt",
        "stride": 512
    },
    "logging": {
        "use_wandb": true,
        "project_name": "sequential-moe",
        "run_name": "moe-12gb-gpu"
    }
}