{
  "model": {
    "max_batch_size": 16,
    "max_seq_len": 512,
    "dtype": "fp32",
    "scale_fmt": null,
    "vocab_size": 32768,
    "dim": 512,
    "inter_dim": 4096,
    "moe_inter_dim": 512,
    "n_layers": 16,
    "n_dense_layers": 3,
    "n_heads": 12,
    "n_routed_experts": 4,
    "n_shared_experts": 1,
    "n_activated_experts": 2,
    "route_scale": 1.0,
    "use_routing_bias": true,
    "q_lora_rank": 0,
    "kv_lora_rank": 256,
    "qk_nope_head_dim": 64,
    "qk_rope_head_dim": 32,
    "v_head_dim": 64,
    "original_seq_len": 4096,
    "rope_theta": 10000.0,
    "rope_factor": 40,
    "beta_fast": 32,
    "beta_slow": 1,
    "mscale": 1.0,
    "tokenizer_name": "turkish"
  },
  "training": {
    "learning_rate": 3e-5,
    "weight_decay": 0.1,
    "beta1": 0.9,
    "beta2": 0.95,
    "grad_clip": 1.0,
    "warmup_steps": 1000,
    "total_steps": 100000,
    "use_checkpointing": false,
    "expert_rotation_steps": 5000,
    "gradient_accumulation_steps": 8,
    "eval_every": 1000,
    "save_every": 5000,
    "save_dir": "./checkpoints",
    "log_every": 100,
    "dtype": "fp32",
    "compile": false,
    "max_val_batches": 50,
    "val_batch_size_multiplier": 4,
    "train_all_experts": false
  },
  "data": {
    "train_file": "./data/train.txt",
    "val_file": "./data/val.txt",
    "stride": 512
  },
  "logging": {
    "use_wandb": true,
    "project_name": "sequential-moe",
    "run_name": "moe-12gb-gpu"
  }
}