{
"model": {
"max_batch_size": 16,
"max_seq_len": 512,
"dtype": "fp32",
"scale_fmt": null,
"vocab_size": 32768,
"dim": 512,
"inter_dim": 4096,
"moe_inter_dim": 512,
"n_layers": 16,
"n_dense_layers": 3,
"n_heads": 12,
"n_routed_experts": 4,
"n_shared_experts": 1,
"n_activated_experts": 2,
"route_scale": 1.0,
"use_routing_bias": true,
"q_lora_rank": 0,
"kv_lora_rank": 256,
"qk_nope_head_dim": 64,
"qk_rope_head_dim": 32,
"v_head_dim": 64,
"original_seq_len": 4096,
"rope_theta": 10000.0,
"rope_factor": 40,
"beta_fast": 32,
"beta_slow": 1,
"mscale": 1.0,
"tokenizer_name": "turkish"
},
"training": {
"learning_rate": 3e-5,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
"grad_clip": 1.0,
"warmup_steps": 1000,
"total_steps": 100000,
"use_checkpointing": false,
"expert_rotation_steps": 5000,
"gradient_accumulation_steps": 8,
"eval_every": 1000,
"save_every": 5000,
"save_dir": "./checkpoints",
"log_every": 100,
"dtype": "fp32",
"compile": false,
"max_val_batches": 50,
"val_batch_size_multiplier": 4,
        "train_all_experts": false
},
"data": {
"train_file": "./data/train.txt",
"val_file": "./data/val.txt",
"stride": 512
},
"logging": {
"use_wandb": true,
"project_name": "sequential-moe",
"run_name": "moe-12gb-gpu"
}
}