{
"model": {
"max_batch_size": 16,
"max_seq_len": 512,
"dtype": "fp32",
"scale_fmt": null,
"vocab_size": 32768,
"dim": 512,
"inter_dim": 4096,
"moe_inter_dim": 512,
"n_layers": 16,
"n_dense_layers": 3,
"n_heads": 12,
"n_routed_experts": 4,
"n_shared_experts": 1,
"n_activated_experts": 2,
"route_scale": 1.0,
"use_routing_bias": true,
"q_lora_rank": 0,
"kv_lora_rank": 256,
"qk_nope_head_dim": 64,
"qk_rope_head_dim": 32,
"v_head_dim": 64,
"original_seq_len": 4096,
"rope_theta": 10000.0,
"rope_factor": 40,
"beta_fast": 32,
"beta_slow": 1,
"mscale": 1.0,
"tokenizer_name": "turkish"
},
"training": {
"learning_rate": 3e-5,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
"grad_clip": 1.0,
"warmup_steps": 1000,
"total_steps": 100000,
"use_checkpointing": false,
"expert_rotation_steps": 5000,
"gradient_accumulation_steps": 8,
"eval_every": 1000,
"save_every": 5000,
"save_dir": "./checkpoints",
"log_every": 100,
"dtype": "fp32",
"compile": false,
"max_val_batches": 50,
"val_batch_size_multiplier": 4,
    "train_all_experts": false
},
"data": {
"train_file": "./data/train.txt",
"val_file": "./data/val.txt",
"stride": 512
},
"logging": {
"use_wandb": true,
"project_name": "sequential-moe",
"run_name": "moe-12gb-gpu"
}
}