{
  "model": {
    "max_batch_size": 16,
    "max_seq_len": 512,
    "dtype": "fp32",
    "scale_fmt": null,
    "vocab_size": 32768,
    "dim": 512,
    "inter_dim": 4096,
    "moe_inter_dim": 512,
    "n_layers": 16,
    "n_dense_layers": 3,
    "n_heads": 12,
    "n_routed_experts": 4,
    "n_shared_experts": 1,
    "n_activated_experts": 2,
    "route_scale": 1.0,
    "use_routing_bias": true,
    "q_lora_rank": 0,
    "kv_lora_rank": 256,
    "qk_nope_head_dim": 64,
    "qk_rope_head_dim": 32,
    "v_head_dim": 64,
    "original_seq_len": 4096,
    "rope_theta": 10000.0,
    "rope_factor": 40,
    "beta_fast": 32,
    "beta_slow": 1,
    "mscale": 1.0,
    "tokenizer_name": "turkish"
  },
  "training": {
    "learning_rate": 3e-5,
    "weight_decay": 0.1,
    "beta1": 0.9,
    "beta2": 0.95,
    "grad_clip": 1.0,
    "warmup_steps": 1000,
    "total_steps": 100000,
    "use_checkpointing": false,
    "expert_rotation_steps": 5000,
    "gradient_accumulation_steps": 8,
    "eval_every": 1000,
    "save_every": 5000,
    "save_dir": "./checkpoints",
    "log_every": 100,
    "dtype": "fp32",
    "compile": false,
    "max_val_batches": 50,
    "val_batch_size_multiplier": 4,
    "train_all_experts": false
  },
  "data": {
    "train_file": "./data/train.txt",
    "val_file": "./data/val.txt",
    "stride": 512
  },
  "logging": {
    "use_wandb": true,
    "project_name": "sequential-moe",
    "run_name": "moe-12gb-gpu"
  }
}